Master Project

In [1]:
import pandas as pd
import numpy as np
import math
import copy
import random
import time
from scipy.interpolate import BSpline
import matplotlib.pyplot as plt
import matplotlib
import multiprocessing as mp
import cProfile
import re
import timeit
import warnings
import shutil
import os
from glob import glob
import gc
#import xlsxwriter
import pywt
import pycwt as wavelet
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_regression
from keras import models, Sequential, optimizers, backend
from keras.layers import core, Conv2D, pooling, Flatten, Dense, Dropout, BatchNormalization
from keras.callbacks import ModelCheckpoint, CSVLogger, TensorBoard
from keras.utils import Sequence
import skimage
from skimage.io import imread
from skimage.transform import resize
Using TensorFlow backend.
In [11]:
# Increase TensorFlow's GPU memory usage in order to be able to run (0.9 seems to result in frequent crashes)
config = tf.ConfigProto()
config.gpu_options.allocator_type ='BFC'
config.gpu_options.per_process_gpu_memory_fraction = 0.9
In [12]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 2718590210865397242
]

Carmenes data set

Import the data from the .dat files

In [5]:
def get_fileInfo():
    # Get the directory for each .dat file and store them in file_paths
    # Choose directory depending on the OS I'm running (similar code is found through the entire project)
    if "Darwin" in os.uname():
        directory = "/Users/jakob/Desktop/Master Project/git/carmenes/"
    else:
        directory = "/home/salomonsson/Master Project/git/carmenes/"
    
    file_paths = glob(directory + "*.dat")            
    # Get the .dat file names
    file_names = []
    for root, dirs, files in os.walk(directory):                                 
        for filename in files:
            if filename.endswith(".dat"):
                filename = filename[:-10]                     # Remove unneccessary "common" information from the name
                file_names.append(filename)
    return file_names, file_paths

file_names, file_paths = get_fileInfo()
In [6]:
def load_all_data():
    # Get the data from each .dat file
    all_loaded_data = []                                       # Create a list to store all the data. 
    for i in range(len(file_paths)):
        data = pd.read_table(file_paths[i], sep=" ", header=None)
        data.index.names = [file_names[i]]                     # Add the file name to the dataframe
        data.dropna(inplace=True)                              # Remove all NaN-values and their respective rows
    
        limits = (data[data.values =="#"])                     # Find all the indeces with a "#"
        data_pieces, lim_count = [], 0
        for i in range(len(limits)):
            start = limits.index[lim_count]
            if lim_count == (len(limits)-1):
                d_piece = data.loc[start:, :1]        # The last data piece contains the last order in the df data
            else:
                d_piece = data.loc[start:limits.index[lim_count+1], :1] # Select the correct data piece
        
            d_piece.index.names = [str(d_piece.index.names)[2:-2] + "_Order_" + str(lim_count)] # add order to name
            d_piece = d_piece[d_piece.iloc[:,0] != "#"]    # Remove all rows with a "#" in data_piece
            d_piece.columns = ["Angstrom", "Flux"]
            data_pieces.append(d_piece)
            lim_count += 1
    
        all_loaded_data.append(data_pieces)
    return all_loaded_data


all_loaded_data = load_all_data()
In [7]:
pd.options.display.max_rows = 50000
np.set_printoptions(threshold = 100000)

Plot some of the raw Carmenes data

first NIR

In [8]:
#Plot these:
a = 1
b = 2
c = 3

test_data1 = pd.read_table(file_paths[a], sep=" ", header=None, skiprows=1, engine='python', na_values={"#", 'o='})
test_data2 = pd.read_table(file_paths[b], sep=" ", header=None, skiprows=1, engine='python', na_values={"#", 'o='})
test_data3 = pd.read_table(file_paths[c], sep=" ", header=None, skiprows=1, engine='python', na_values={"#", 'o='})
print(file_paths[a])
print(file_paths[b])
print(file_paths[c][-45:-4])
/home/salomonsson/Master Project/git/carmenes/car-20170609T20h33m03s-sci-gtoc-nir_A-tac.dat
/home/salomonsson/Master Project/git/carmenes/car-20170822T01h54m18s-sci-gtoc-nir_A-tac.dat
car-20170924T20h42m07s-sci-gtoc-nir_A-tac
In [9]:
display(test_data1.head(10))
0 1 2
0 9603.598508 NaN NaN
1 9603.651896 NaN NaN
2 9603.705279 NaN NaN
3 9603.758657 NaN NaN
4 9603.812029 NaN NaN
5 9603.865396 NaN NaN
6 9603.918758 NaN NaN
7 9603.972115 0.069754 0.002209
8 9604.025467 0.106114 0.002894
9 9604.078814 0.109209 0.002949
In [10]:
# Plot it
plt.figure(figsize=(15,8))
plt.plot(test_data1.iloc[:,0], test_data1.iloc[:,1], color='green')
plt.plot(test_data2.iloc[:,0], test_data2.iloc[:,1], color='red')
plt.plot(test_data3.iloc[:,0], test_data3.iloc[:,1], color='black')
plt.title("Carmenes NIR, order 1", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=15, weight=0)
plt.xlabel("Angstrom", fontsize=15, weight=0)
plt.legend([file_paths[a][-45:-4], file_paths[b][-45:-4], file_paths[c][-45:-4]], loc="upper right", fontsize=12)
plt.xticks(fontsize=12, weight=0)
plt.yticks(fontsize=12, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.ylim((-0.01,1))
plt.show()

and now VIS

In [11]:
#Plot these:
aa = 4
bb = 6
cc = 7

test_data11 = pd.read_table(file_paths[aa], sep=" ", header=None, skiprows=1, engine='python', na_values={"#", 'o='})
test_data22 = pd.read_table(file_paths[bb], sep=" ", header=None, skiprows=1, engine='python', na_values={"#", 'o='})
test_data33 = pd.read_table(file_paths[cc], sep=" ", header=None, skiprows=1, engine='python', na_values={"#", 'o='})
print(file_paths[aa])
print(file_paths[bb])
print(file_paths[cc][-45:-4])
/home/salomonsson/Master Project/git/carmenes/car-20170914T03h24m58s-sci-gtoc-vis_A-tac.dat
/home/salomonsson/Master Project/git/carmenes/car-20170912T02h41m57s-sci-gtoc-vis_A-tac.dat
car-20170822T01h54m18s-sci-gtoc-vis_A-tac
In [12]:
# Plot it
plt.figure(figsize=(15,8))
plt.plot(test_data11.iloc[:,0], test_data11.iloc[:,1], color='green')
plt.plot(test_data22.iloc[:,0], test_data22.iloc[:,1], color='red')
plt.plot(test_data33.iloc[:,0], test_data33.iloc[:,1], color='black')
plt.title("Carmenes VIS, order 1", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=15, weight=0)
plt.xlabel("Angstrom", fontsize=15, weight=0)
plt.legend([file_paths[a][-45:-4], file_paths[b][-45:-4], file_paths[c][-45:-4]], loc="lower left", fontsize=12)
plt.xticks(fontsize=12, weight=0)
plt.yticks(fontsize=12, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.ylim((-1e7,1.3e7))
plt.show()
In [13]:
print(len(file_paths))
18
In [14]:
print(all_loaded_data[1][0].iloc[2,:])
Angstrom    9604.07881353
Flux             0.109209
Name: 10, dtype: object

Pre-process the Carmenes data

In [15]:
def seperate_data(data):
    # Create two lists, one with all the visual (vis) values and one for all the near infrared values (nir)
    all_loaded_data_VIS = []
    all_loaded_data_NIR = []

    for i in range(len(data)):
        if "vis" in str(data[i][0].index.names):
            all_loaded_data_VIS.append(copy.deepcopy(data[i]))
        elif "nir" in str(data[i][0].index.names):
            all_loaded_data_NIR.append(copy.deepcopy(data[i]))
        else:
            pass
    return all_loaded_data_VIS, all_loaded_data_NIR


all_loaded_data_VIS, all_loaded_data_NIR = seperate_data(all_loaded_data)
In [16]:
print(len(all_loaded_data_VIS))    
print(len(all_loaded_data_VIS[0][0]))
print(len(all_loaded_data_VIS[0]))
9
3093
61

Align the min and max values in each dataframe so each order start and ends at the same indeces.

That is, all order 0 start and ends at the same indeces, all order 1 start and ends at the same indeces, etc. etc. The chosen start index is the largest index among the start indeces and the end index is chosen as the smallest among the end indices.

In [17]:
"""Visual wavelength (VIS) data"""
for i in range(len(all_loaded_data_VIS[0])):
    all_min_VIS_values, all_max_VIS_values = [], []
    for j in range(len(all_loaded_data_VIS)):
        all_min_VIS_values.append(all_loaded_data_VIS[j][i].index[0])
        all_max_VIS_values.append(all_loaded_data_VIS[j][i].index[-1])
    VIS_min, VIS_max = max(all_min_VIS_values), min(all_max_VIS_values) # choose the min and max limits for each order
    
    # Align the values for all the VIS data (according to VIS_min and VIS_max) so they match each other.
    for h in range(len(all_loaded_data_VIS)): 
        all_loaded_data_VIS[h][i] = all_loaded_data_VIS[h][i].loc[VIS_min:VIS_max]
     
    
"""Near Infrared (NIR) data"""  
for k in range(len(all_loaded_data_NIR[0])):
    all_min_NIR_values, all_max_NIR_values = [], []
    for m in range(len(all_loaded_data_NIR)):
        all_min_NIR_values.append(all_loaded_data_NIR[m][k].index[0])
        all_max_NIR_values.append(all_loaded_data_NIR[m][k].index[-1]) 
    NIR_min, NIR_max = max(all_min_NIR_values), min(all_max_NIR_values) # choose the min and max limits for each order
    
    # Align the values for all the NIR data (according to NIR_min and NIR_max) so they match each other.
    for n in range(len(all_loaded_data_NIR)):
        all_loaded_data_NIR[n][k] = all_loaded_data_NIR[n][k].loc[NIR_min:NIR_max]
In [18]:
print(all_loaded_data_VIS[0][0].iloc[:10,])
                                                  Angstrom       Flux
car-20170913T21h52m13s-sci-gtoc-vis_Order_0                          
1066                                         5164.38938776   274836.0
1067                                         5164.41411195   -42418.2
1068                                         5164.43883341   483369.0
1069                                         5164.46355214    89297.5
1070                                         5164.48826813   133463.0
1071                                          5164.5129814  -111887.0
1072                                         5164.53769193  -279716.0
1073                                         5164.56239972  -158934.0
1074                                         5164.58710478   -2828.23
1075                                          5164.6118071    99728.4
In [19]:
print(all_loaded_data_NIR[0][0].iloc[:10,])
                                                  Angstrom       Flux
car-20170609T20h33m03s-sci-gtoc-nir_Order_0                          
8                                            9603.97211533  0.0697542
9                                            9604.02546703   0.106114
10                                           9604.07881353   0.109209
11                                           9604.13215483   0.116303
12                                           9604.18549094   0.124799
13                                           9604.23882186   0.136005
14                                           9604.29214757   0.137569
15                                           9604.34546809   0.137595
16                                           9604.39878341   0.140179
17                                           9604.45209353   0.139836
In [20]:
print(all_loaded_data_NIR[1][0].iloc[:10,])
                                                  Angstrom      Flux
car-20170822T01h54m18s-sci-gtoc-nir_Order_0                         
8                                            9603.97966667  0.282435
9                                            9604.03301111  0.295427
10                                           9604.08635042  0.290871
11                                           9604.13968459  0.288706
12                                           9604.19301361  0.292839
13                                            9604.2463375  0.290136
14                                           9604.29965624  0.288772
15                                           9604.35296985  0.294931
16                                           9604.40627832  0.290882
17                                           9604.45958165  0.293751

Find the averages for each Angstrom value and replace the old values with the averages. Carmenes data

(The run will take some 20 min in total)

In [21]:
# Create new lists for the average values (a copy of the normal ones).
all_loaded_data_VIS_average = copy.deepcopy(all_loaded_data_VIS)
all_loaded_data_NIR_average = copy.deepcopy(all_loaded_data_NIR)
In [22]:
import re 
from itertools import groupby
# Define some functions

def create_file_path(name):
    """ Create file path to store files"""
    if "Darwin" in os.uname():  # if it's a Mac
        base = "/Users/jakob/Desktop/Master Project/git/carmenes_normalised/"
    else:
        base = "/home/salomonsson/Master Project/git/carmenes_normalised/"
    return(base + name + ".dat")

def sorted_nicely(l): 
    """ Sort the list in a smarter way.""" 
    convert = lambda text: int(text) if text.isdigit() else text 
    alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 
    return l.sort(key = alphanum_key)
# https://stackoverflow.com/questions/2669059/how-to-sort-alpha-numeric-set-in-python

def sorted_nicely_short(s):
    return [int(''.join(g)) if k else ''.join(g) for k, g in groupby('\0'+s, str.isdigit)]
# https://stackoverflow.com/questions/2669059/how-to-sort-alpha-numeric-set-in-python

VIS data

In [23]:
# Choose directory depending on the OS I'm running
if "Darwin" in os.uname():
    d = "/Users/jakob/Desktop/Master Project/git/carmenes_normalised/"
else:
    d = "/home/salomonsson/Master Project/git/carmenes_normalised/"
    
quantity1 = len(all_loaded_data_VIS_average) * len(all_loaded_data_VIS_average[0])   
        
for dirpath, dirnames, files in os.walk(d):   # Get the amount of VIS files in the directory d
    vis_files = []
    for i in files:
        if "vis" in i:
            vis_files.append(i) 

    if len(vis_files) != quantity1:           # if all normalised VIS files arent in d, normalise all_loaded_data_VIS
        start1 = time.time()
        
        """Visual wavelength (VIS) data"""
        for a in range(len(all_loaded_data_VIS_average[0])):
            for i in range(len(all_loaded_data_VIS_average[0][0])):
                all_VIS_values = []
                for j in range(len(all_loaded_data_VIS_average)):
                    all_VIS_values.append(float(all_loaded_data_VIS_average[j][a].iloc[i, 0]))
                VIS_average = np.mean(all_VIS_values)                                           # Get the average
    
                # Replace the old values with the VIS_average.
                for h in range(len(all_loaded_data_VIS_average)): 
                    all_loaded_data_VIS_average[h][a].iloc[i, 0] = VIS_average

        # Check how long time it took.
        end1 = time.time()
        tt1 = end1 - start1
        print("Total run time: {0:0.0f} min and {1:0.0f} s ".format((tt1-tt1%60)/60, tt1%60))
        
        # Make sure all values are float values
        for i in range(len(all_loaded_data_VIS_average)):
            for j in range(len(all_loaded_data_VIS_average[i])):
                all_loaded_data_VIS_average[i][j].iloc[:,0]=list(map(float, all_loaded_data_VIS_average[i][j].iloc[:,0]))
                all_loaded_data_VIS_average[i][j].iloc[:,1]=list(map(float, all_loaded_data_VIS_average[i][j].iloc[:,1]))
        
        # Save the normalised VIS data in separate files
        for i in range(len(all_loaded_data_VIS_average)):
            for j in range(len(all_loaded_data_VIS_average[0])):
                name = all_loaded_data_VIS_average[i][j].index.name
                all_loaded_data_VIS_average[i][j].to_csv(create_file_path(name), header=False, index=True)
    
    else:
        print("Normalised VIS files already exists, download them instead to save time.")


# Get the file names and save them to a list
vis_files = []
for file_name in files:
    if "vis" in file_name:
        file_name = file_name[:-4]     # Remove the .dat notation in the name
        vis_files.append(file_name)
Normalised VIS files already exists, download them instead to save time.

NIR data

In [24]:
quantity2 = len(all_loaded_data_NIR_average) * len(all_loaded_data_NIR_average[0])
    
for dirpath, dirnames, files in os.walk(d):   
    nir_files = []
    for i in files:
        if "nir" in i:
            nir_files.append(i)
            
    if len(nir_files) != quantity2:           # if not all normalised NIR files are in d, normalise all_loaded_data_NIR
        start2 = time.time()
    
        """Near Infrared (NIR) data"""  
        for b in range(len(all_loaded_data_NIR_average[0])):
            for k in range(len(all_loaded_data_NIR_average[0][0])):
                all_NIR_values = []
                for m in range(len(all_loaded_data_NIR_average)):
                    all_NIR_values.append(float(all_loaded_data_NIR_average[m][b].iloc[k, 0]))
                NIR_average = np.mean(all_NIR_values)                                           # Get the average        
    
                # Replace the old values with the NIR_average.
                for n in range(len(all_loaded_data_NIR_average)):
                    all_loaded_data_NIR_average[n][b].iloc[k, 0] = NIR_average
            
        # Check how long time it took.
        end2 = time.time()
        tt2 = end2 - start2
        print("Total run time: {0:0.0f} min and {1:0.0f} s ".format((tt2-tt2%60)/60, tt2%60))

        # Make sure all values are float values
        for i in range(len(all_loaded_data_NIR_average)):
            for j in range(len(all_loaded_data_NIR_average[i])):
                all_loaded_data_NIR_average[i][j].iloc[:,0]=list(map(float, all_loaded_data_NIR_average[i][j].iloc[:,0]))
                all_loaded_data_NIR_average[i][j].iloc[:,1]=list(map(float, all_loaded_data_NIR_average[i][j].iloc[:,1]))
        
        # Save the normalised NIR data in separate files
        for x in range(len(all_loaded_data_NIR_average)):
            for y in range(len(all_loaded_data_NIR_average[0])):
                name = all_loaded_data_NIR_average[x][y].index.name
                all_loaded_data_NIR_average[x][y].to_csv(create_file_path(name), header=False, index=True)
        
    else:
        print("All the NIR files are already normalised, download them instead.")
        
        
# Get the file names and save them to a list
nir_files = []
for file_name in files:
    if "nir" in file_name:
        file_name = file_name[:-4]     # Remove the .dat notation in the name
        nir_files.append(file_name)
All the NIR files are already normalised, download them instead.

Download the data

In [25]:
# Sort the lists
vis_files = sorted(vis_files, key=sorted_nicely_short)
nir_files = sorted(nir_files, key=sorted_nicely_short)
files = sorted(files, key=sorted_nicely_short)

#sorted_nicely(vis_files)
#sorted_nicely(nir_files)
#sorted_nicely(files)

# Get the directory for each .dat file and store them in file_paths
if "Darwin" in os.uname():
    d = "/Users/jakob/Desktop/Master Project/git/carmenes_normalised/"
else:
    d = "/home/salomonsson/Master Project/git/carmenes_normalised/"
    
file_paths = glob(d + "*.dat") 
file_paths = sorted(file_paths, key=sorted_nicely_short)

# Save NIR and VIS in separate lists
file_paths_NIR, file_paths_VIS = [], []
for path in file_paths:
    if "nir" in path:
        file_paths_NIR.append(path)
    elif "vis" in path:
        file_paths_VIS.append(path)
In [26]:
def combine_Carmenes_Orders(folder, NIR_or_VIS):
    # A function to combine and plot the carmenes data with all orders included
    if NIR_or_VIS.upper() == "NIR":
        tp = "/NIR/"
    elif NIR_or_VIS.upper() == "VIS":
        tp = "/VIS/"
    else:
        print("Something is not correct")
    
    if "Darwin" in os.uname():
        d = "/Users/jakob/Desktop/Master Project/git/Carmenes_norm_plots/" + folder + tp
    else:
        d = "/home/salomonsson/Master Project/git/Carmenes_norm_plots/" + folder + tp
    
    file_paths_Carm = glob(d + "*.dat") 
    file_paths_Carm = sorted(file_paths_Carm, key=sorted_nicely_short)
    list_of_df = []
    for path in file_paths_Carm:
        data = pd.read_table(path, sep=',', header=None)
        list_of_df.append(data)
    final_data = pd.concat(list_of_df)
    final_data.index.names = [str(path)[70:-13]]   # add name
    final_data.columns = ["Index", "Angstrom", "Flux"]
    return final_data

NIR_data1 = combine_Carmenes_Orders("plot1", "NIR")    
NIR_data2 = combine_Carmenes_Orders("plot2", "NIR") 
NIR_data3 = combine_Carmenes_Orders("plot3", "NIR") 

VIS_data1 = combine_Carmenes_Orders("plot1", "VIS")
VIS_data2 = combine_Carmenes_Orders("plot2", "VIS") 
VIS_data3 = combine_Carmenes_Orders("plot3", "VIS")
In [27]:
# Plot it
plt.figure(figsize=(15,8))
plt.plot(NIR_data3.iloc[:,1], NIR_data3.iloc[:,2], color='green')
plt.plot(NIR_data1.iloc[:,1], NIR_data1.iloc[:,2], color='red')
plt.plot(NIR_data2.iloc[:,1], NIR_data2.iloc[:,2], color='black')
plt.title("Normalised Carmenes NIR", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=18, weight=0)
plt.xlabel("Angstrom", fontsize=18, weight=0)
plt.legend([str(NIR_data3.index.names)[2:-2], 
            str(NIR_data1.index.names)[2:-2], 
            str(NIR_data2.index.names)[2:-2]], loc="upper right", fontsize=15)
plt.xticks(fontsize=15, weight=0)
plt.yticks(fontsize=15, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.ylim((-0.01,1))
plt.show()
In [28]:
# Plot it
plt.figure(figsize=(15,8))
plt.plot(VIS_data3.iloc[:,1], VIS_data3.iloc[:,2], color='green')
plt.plot(VIS_data1.iloc[:,1], VIS_data1.iloc[:,2], color='red')
plt.plot(VIS_data2.iloc[:,1], VIS_data2.iloc[:,2], color='black')
plt.title("Normalised Carmenes VIS", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=18, weight=0)
plt.xlabel("Angstrom", fontsize=18, weight=0)
plt.legend([str(VIS_data3.index.names)[2:-2], 
            str(VIS_data1.index.names)[2:-2], 
            str(VIS_data2.index.names)[2:-2]], loc="lower left", fontsize=15)
plt.xticks(fontsize=15, weight=0)
plt.yticks(fontsize=15, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.ylim((-1e7,1.3e7))
plt.show()
In [29]:
print(len(files))
#print(files)
print(len(file_paths))
print(len(nir_files))
print(len(vis_files))
801
801
252
549
In [30]:
# Load the data from the .dat files

# VIS
nbr = 0
for i in range(len(all_loaded_data_VIS_average)):
    for j in range(len(all_loaded_data_VIS_average[i])):
        all_loaded_data_VIS_average[i][j] = pd.read_csv(file_paths_VIS[nbr], names=["Angstrom", "Flux"])   
        all_loaded_data_VIS_average[i][j].index.names = [vis_files[nbr]]
        nbr += 1
        
# NIR
nbr = 0
for i in range(len(all_loaded_data_NIR_average)):
    for j in range(len(all_loaded_data_NIR_average[i])):
        all_loaded_data_NIR_average[i][j] = pd.read_csv(file_paths_NIR[nbr], names=["Angstrom", "Flux"])   
        all_loaded_data_NIR_average[i][j].index.names = [nir_files[nbr]]
        nbr += 1

# VIS + NIR (this will be easier to use in future pre-processing steps)
all_loaded_data_average = []
nbr = 0
for i in range(len(all_loaded_data_VIS_average)):
    for j in range(len(all_loaded_data_VIS_average[i])):
        data_comb = pd.read_csv(file_paths[nbr], names=["Angstrom", "Flux"])   
        data_comb.index.names = [files[nbr]]
        all_loaded_data_average.append(data_comb)
        nbr += 1 
for i in range(len(all_loaded_data_NIR_average)):
    for j in range(len(all_loaded_data_NIR_average[i])):
        data_comb = pd.read_csv(file_paths[nbr], names=["Angstrom", "Flux"])   
        data_comb.index.names = [files[nbr]]
        all_loaded_data_average.append(data_comb)
        nbr += 1        
In [31]:
#print(all_loaded_data_NIR_average[0][0].iloc[0,:])
L =  []
L.append(all_loaded_data_NIR_average[0][0].iloc[0,:])
L.append(all_loaded_data_NIR_average[0][0].iloc[1,:])
L.append(all_loaded_data_NIR_average[0][0].iloc[2,:])

df = pd.DataFrame(L)

#print(L)
display(df)
#temp = all_loaded_data_NIR_average[0][0].iloc[0,:]
#temp["Flux"] = 23
#print(temp)
Angstrom Flux
8 9603.977409 0.307987
9 9604.030755 0.318445
10 9604.084096 0.321639

Convert the Flux values from string to float.

In [32]:
# For Visual data
for i in range(len(all_loaded_data_VIS_average)):
    for j in range(len(all_loaded_data_VIS_average[i])):
        all_loaded_data_VIS_average[i][j].iloc[:,1] = list(map(float, all_loaded_data_VIS_average[i][j].iloc[:,1]))

# For Near Infrared data
for i in range(len(all_loaded_data_NIR_average)):
    for j in range(len(all_loaded_data_NIR_average[i])):
        all_loaded_data_NIR_average[i][j].iloc[:,1] = list(map(float, all_loaded_data_NIR_average[i][j].iloc[:,1]))

# For the combined (VIS + NIR) data
for j in range(len(all_loaded_data_average)):
    all_loaded_data_average[j].iloc[:,0] = list(map(float, all_loaded_data_average[j].iloc[:,0]))
    all_loaded_data_average[j].iloc[:,1] = list(map(float, all_loaded_data_average[j].iloc[:,1]))
In [33]:
print(len(all_loaded_data_average))
#print(all_loaded_data_average[100])
print(type(all_loaded_data_VIS_average[0][0].iloc[0,0]))
801
<class 'numpy.float64'>
In [34]:
print(all_loaded_data_VIS_average[0][0].iloc[:10,])
print(all_loaded_data_VIS_average[1][0].iloc[:10,])
print(all_loaded_data_NIR_average[0][0].iloc[:10,])
print(all_loaded_data_NIR_average[1][0].iloc[:10,])
                                                Angstrom      Flux
car-20170520T20h38m14s-sci-gtoc-vis_Order_0                       
1066                                         5164.389835  595812.0
1067                                         5164.414559  261148.0
1068                                         5164.439280   76562.7
1069                                         5164.463998  408231.0
1070                                         5164.488713  770221.0
1071                                         5164.513426  526151.0
1072                                         5164.538136  753542.0
1073                                         5164.562843  841066.0
1074                                         5164.587548  412286.0
1075                                         5164.612250  538195.0
                                                Angstrom      Flux
car-20170609T20h33m03s-sci-gtoc-vis_Order_0                       
1066                                         5164.389835  557447.0
1067                                         5164.414559 -371674.0
1068                                         5164.439280  148020.0
1069                                         5164.463998 -162322.0
1070                                         5164.488713  220413.0
1071                                         5164.513426  804392.0
1072                                         5164.538136 -331859.0
1073                                         5164.562843  632605.0
1074                                         5164.587548  154854.0
1075                                         5164.612250 -149067.0
                                                Angstrom      Flux
car-20170520T20h38m14s-sci-gtoc-nir_Order_0                       
8                                            9603.977409  0.307987
9                                            9604.030755  0.318445
10                                           9604.084096  0.321639
11                                           9604.137431  0.321562
12                                           9604.190762  0.321675
13                                           9604.244087  0.324564
14                                           9604.297408  0.328767
15                                           9604.350723  0.328893
16                                           9604.404033  0.326675
17                                           9604.457338  0.340958
                                                Angstrom      Flux
car-20170609T20h33m03s-sci-gtoc-nir_Order_0                       
8                                            9603.977409  0.069754
9                                            9604.030755  0.106114
10                                           9604.084096  0.109209
11                                           9604.137431  0.116303
12                                           9604.190762  0.124799
13                                           9604.244087  0.136005
14                                           9604.297408  0.137569
15                                           9604.350723  0.137595
16                                           9604.404033  0.140179
17                                           9604.457338  0.139836

Visualise some of the pre-processed Carmenes dataset

alpha is neglected due to its small influence. Its also partly included in the metallicity, and as such, will make a very small difference when neglected.

In [35]:
x1 = all_loaded_data_VIS_average[0][0].iloc[:, 0]
y1 = all_loaded_data_VIS_average[0][0].iloc[:, 1]
x2 = all_loaded_data_VIS_average[1][0].iloc[:, 0]
y2 = all_loaded_data_VIS_average[1][0].iloc[:, 1]
x3 = all_loaded_data_VIS_average[2][0].iloc[:, 0]
y3 = all_loaded_data_VIS_average[2][0].iloc[:, 1]

plt.figure(figsize=(15,8))
plt.plot(x1.values, y1.values)
plt.plot(x2.values, y2.values)
plt.plot(x3.values, y3.values)
plt.title("Carmenes VIS", fontsize=20)
plt.ylabel("Flux", fontsize=18)
plt.xlabel("Angstrom", fontsize=18)
plt.legend([x1.index.names[0], x2.index.names[0], x2.index.names[0]], loc="upper left", fontsize=15)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.autoscale(enable=True, axis="x", tight=True)
plt.show()
In [36]:
xx1 = all_loaded_data_NIR_average[0][0].iloc[:, 0]
yy1 = all_loaded_data_NIR_average[0][0].iloc[:, 1]
xx2 = all_loaded_data_NIR_average[1][0].iloc[:, 0]
yy2 = all_loaded_data_NIR_average[1][0].iloc[:, 1]
xx3 = all_loaded_data_NIR_average[2][0].iloc[:, 0]
yy3 = all_loaded_data_NIR_average[2][0].iloc[:, 1]

plt.figure(figsize=(15,8))
plt.plot(xx1.values, yy1.values)
plt.plot(xx2.values, yy2.values)
plt.plot(xx3.values, yy3.values)
plt.title("Carmenes NIR", fontsize=20)
plt.ylabel("Flux", fontsize=18)
plt.xlabel("Angstrom", fontsize=18)
plt.legend([xx1.index.names[0], xx2.index.names[0], xx2.index.names[0]], loc="upper right", fontsize=15)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.autoscale(enable=True, axis="x", tight=True)
plt.show()

#(The big step (hole) below is just one step)
In [37]:
NIR_data1 = combine_Carmenes_Orders("plot1", "NIR")    
NIR_data2 = combine_Carmenes_Orders("plot2", "NIR") 
NIR_data3 = combine_Carmenes_Orders("plot3", "NIR") 

VIS_data1 = combine_Carmenes_Orders("plot1", "VIS")
VIS_data2 = combine_Carmenes_Orders("plot2", "VIS") 
VIS_data3 = combine_Carmenes_Orders("plot3", "VIS")
In [38]:
# Plot it
plt.figure(figsize=(15,8))
plt.plot(NIR_data3.iloc[:,1], NIR_data3.iloc[:,2], color='green')
plt.plot(NIR_data1.iloc[:,1], NIR_data1.iloc[:,2], color='red')
plt.plot(NIR_data2.iloc[:,1], NIR_data2.iloc[:,2], color='black')
plt.title("Normalised Carmenes NIR", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=18, weight=0)
plt.xlabel("Angstrom", fontsize=18, weight=0)
plt.legend([str(NIR_data3.index.names)[2:-2], 
            str(NIR_data1.index.names)[2:-2], 
            str(NIR_data2.index.names)[2:-2]], loc="upper right", fontsize=15)
plt.xticks(fontsize=15, weight=0)
plt.yticks(fontsize=15, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.ylim((-0.01,1))
plt.show()
In [39]:
# Plot it
plt.figure(figsize=(15,8))
plt.plot(VIS_data3.iloc[:,1], VIS_data3.iloc[:,2], color='green')
plt.plot(VIS_data1.iloc[:,1], VIS_data1.iloc[:,2], color='red')
plt.plot(VIS_data2.iloc[:,1], VIS_data2.iloc[:,2], color='black')
plt.title("Normalised Carmenes VIS", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=18, weight=0)
plt.xlabel("Angstrom", fontsize=18, weight=0)
plt.legend([str(VIS_data3.index.names)[2:-2], 
            str(VIS_data1.index.names)[2:-2], 
            str(VIS_data2.index.names)[2:-2]], loc="lower left", fontsize=15)
plt.xticks(fontsize=15, weight=0)
plt.yticks(fontsize=15, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.ylim((-1e7,1.3e7))
plt.show()

BT-Settl data set

Import the data from the .dat files

In [40]:
def get_btsettl_info(NIR_or_VIS):
    # Get the directory for each .dat file and store them in file_paths
    if NIR_or_VIS.upper() == "NIR":
        f = "NIR/"
    elif NIR_or_VIS.upper() == "VIS":
        f = "VIS/"
    else:
        print("Some kind of in(out)put error has occured")
        
    if "Darwin" in os.uname():
        dir_settl = "/Users/jakob/Desktop/Master Project/bt-settl (new version)/" + f
    else:
        dir_settl = "/opt/bt-settl/"
    
    paths_settl = glob(dir_settl + "*.txt")     # all the sub directories and all the .dat files       
    
    # Get the .dat file names
    file_names_settl = []
    for root, dirs, files_bt in os.walk(dir_settl):
        for filename in files_bt:
            if filename.endswith(".txt"):
                f = filename.split(".BT")[0]            # Remove unneccessary "common" information from the name  
                file_names_settl.append(f)
    file_names_settl = sorted(file_names_settl, key=sorted_nicely_short)
    paths_settl = sorted(paths_settl, key=sorted_nicely_short)
    return file_names_settl, paths_settl

file_names_settl, paths_settl = get_btsettl_info('NIR')
In [41]:
print(len(paths_settl))
4046
In [42]:
def get_btsettl_data(position):
    """ Returns the btsettl dataframe at position "position". """
    data = pd.read_table(paths_settl[position], sep='\s+', header=None, skiprows=8)
    data.index.names = [file_names_settl[position]]               # Add the file name to the dataframe
    data.dropna(inplace=True)                                     # Remove NaN-values
    data.columns = ["Angstrom", "Flux"]
    return data


def create_btsettl_path(name):
    """ Create file path to store files"""
    if "Jakobs-MBP.home" in os.uname():
        base = "/Users/jakob/Desktop/Master Project/bt-settl_normalised/"
    else:
        base = "/home/salomonsson/Master Project/bt-settl/"
    return(base + name + ".dat")


def create_btsettl_path_CLEAN(name):
    """ Create file path to store files"""
    if "Jakobs-MBP.home" in os.uname():
        base = "/Users/jakob/Desktop/Master Project/bt-settl-CLEAN/"
    else:
        base = "/home/salomonsson/Master Project/bt-settl-CLEAN/"
    return(base + name + ".dat")


def get_btsettl_data_byName(name):
    """ Returns the btsettl dataframe with the name "name". """
    for path in paths_settl:
        if name in path:
            data = pd.read_table(path, sep='\s+', header=None, skiprows=8)
            data.index.names = [name]                                     # Add the file name to the dataframe
            data.dropna(inplace=True)                                     # Remove NaN-values (if any)
            data.columns = ["Angstrom", "Flux"]
    return data    


## https://stackoverflow.com/questions/2566412/find-nearest-value-in-numpy-array  # + added functionality
def find_nearest(array,value):
    '''Given an ``array`` , and given a ``value`` , returns a value j such that ``value`` is between array[j]
    and array[j+1]. ``array`` must be monotonic increasing. j=-1 or j=len(array) is returned
    to indicate that ``value`` is out of range below and above respectively.'''
    n = len(array)
    if (value < array[0]):
        return -1
    elif (value > array[n-1]):
        return n
    jl = 0                                                   # Initialize lower
    ju = n-1                                                 # and upper limits.
    while (ju-jl > 1):                                       # If we are not yet done,
        jm=(ju+jl) >> 1                                      # compute a midpoint with a bitshift
        if (value >= array[jm]):
            jl=jm                                            # and replace either the lower limit
        else:
            ju=jm                                            # or the upper limit, as appropriate.
        # Repeat until the test condition is satisfied.
    up_dif = np.abs(value-array[jl-1])
    down_dif = np.abs(value-array[jl+1])
    if (value == array[0]):                                  # edge cases at bottom
        return 0
    elif (value == array[n-1]):                              # and top
        return n-1
    elif up_dif > down_dif:
        return array[jl+1]
    elif up_dif < down_dif:
        return array[jl-1]
    else:
        return array[jl]

Visualise the Raw BT-Settl data

In [43]:
xx1 = get_btsettl_data_byName(file_names_settl[0])
yy1 = get_btsettl_data_byName(file_names_settl[0])
xx2 = get_btsettl_data_byName(file_names_settl[1])
yy2 = get_btsettl_data_byName(file_names_settl[1])
xx3 = get_btsettl_data_byName(file_names_settl[2])
yy3 = get_btsettl_data_byName(file_names_settl[2])
In [44]:
plt.figure(figsize=(15,8))
plt.plot(xx1.iloc[:, 0].values, yy1.iloc[:, 1].values, color='black')
plt.plot(xx2.iloc[:, 0].values, yy2.iloc[:, 1].values, color='red')
plt.plot(xx3.iloc[:, 0].values, yy3.iloc[:, 1].values, color='green')
plt.title("Raw BT-Settl, Full Spectra", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=18, weight=0)
plt.xlabel("Angstrom", fontsize=18, weight=0)
plt.legend([xx1.index.names[0], xx2.index.names[0], xx3.index.names[0]], loc="upper right", fontsize=15)
plt.xticks(fontsize=15, weight=0)
plt.yticks(fontsize=15, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.ylim((0,0.001))
plt.show()
In [45]:
print(len(all_loaded_data_VIS_average[0][0]))
print(len(all_loaded_data_VIS_average[0]))
print(len(all_loaded_data_NIR_average[0][0]))
print(len(all_loaded_data_NIR_average[0]))

""" Displaying that Angstrom values are monotonic increasing ==> by doing this, 'find_nearest' can be used """
for j in range(len(all_loaded_data_VIS_average[0])):
    for i in range(len(all_loaded_data_VIS_average[0][j])-1):
        if all_loaded_data_VIS_average[0][j].values[i,0] <= all_loaded_data_VIS_average[0][j].values[i+1,0]:
            pass
        else:
            print("NOT ok VIS")

for j in range(len(all_loaded_data_NIR_average[0])):
    for i in range(len(all_loaded_data_NIR_average[0][j])-1):
        if all_loaded_data_NIR_average[0][j].values[i,0] <= all_loaded_data_NIR_average[0][j].values[i+1,0]:
            pass
        else:
            print("NOT ok NIR")
            
print("Everythings fine")
2871
61
4073
28
Everythings fine
In [46]:
print(len(file_names_settl))
4046
In [47]:
# Make some local copies
data_VIS_avg_LOCAL = copy.deepcopy(all_loaded_data_VIS_average)
data_NIR_avg_LOCAL = copy.deepcopy(all_loaded_data_NIR_average)
In [48]:
len(data_NIR_avg_LOCAL[0])
Out[48]:
28
In [49]:
def create_btsettl_data_FINAL(carmenes_data, file_names, output):
    """ Convolution of the BT-Settl data set.
       'file_names' is a list of BT-Settl data file names, output is a string, either "NIR" or "VIS". """
    if output.upper() == "NIR":
        light = "_NIR"
    elif output.upper() == "VIS":
        light = "_VIS"
    else:
        print("Some kind of in(out)put error has occured")
    
    nv, R = 50, 82000 
    for name in file_names: 
        order = 0
        data = get_btsettl_data_byName(name)
        name = name + light
        for m in range(len(carmenes_data[0])): 
            small_list = []
            for n in range(len(carmenes_data[0][0])):
                """Perform the necessary calculations"""
                
                # Get a wavelength value from the carmenes dataset, 
                # store it in the variable 'look_here'
                look_here = float(copy.deepcopy(carmenes_data[0][m].values[n,0]))
                
                # Select the BT-Settl wavelengths
                ar = data['Angstrom'].values
                
                # Get the n’th wavelength value from the carmenes dataset
                # that is closest to the 'look_here' variable
                use_this = find_nearest(ar,look_here)            
                
                # Get the corresponding index
                itemindex = np.where(ar==use_this)[0]
                
                # Convert to integers and floats
                ind = int(itemindex[int(0)])
                use_this = float(use_this)
                
                # Convolution intervals
                xsg1 = ind - nv
                xsg2 = ind + nv
                
                # Apply these intervals to the BT-Settl data
                xsgA = data["Angstrom"].iloc[xsg1:(xsg2+1)]
                xsgF = data["Flux"].iloc[(xsg1-1):xsg2]
                
                # Calculate the impact from the chosen BT-Settl flux values
                xt = copy.deepcopy(use_this)
                sgm = float(use_this/(R*2*np.sqrt(2.*np.log(2.))))
                flt = np.exp(-(xsgA - xt)**2/(2*sgm**2))/(np.sqrt(2*np.pi)*sgm) 
                
                # Calculate the sum of this impact, reverse xsgF before multiplying
                xsgA2 = data["Angstrom"].iloc[(xsg1-1):(xsg2+1)]
                the_sum = np.sum(np.diff(xsgA2)*flt*xsgF[::-1])
                
                # Add the newly calculated flux to the Carmenes data on the appropriate position
                temp = copy.deepcopy(carmenes_data[0][m].iloc[n,:])
                temp["Flux"] = the_sum
                small_list.append(temp)
            
            # Add some space
            temp_list = []
            temp_list.append(' ')
            temp_list.append(' ')
            temp_list.append("#order: {0}".format(order))
            pd.DataFrame(temp_list).to_csv(create_btsettl_path(name), 
                                           header=False, index=False, mode="a")
            order += 1
            
            # Create the dataframe
            df = pd.DataFrame(small_list)
            
            # Convert the df to numerical values
            df = df.apply(pd.to_numeric, errors='ignore')  
            
            # Calculate the Rolling mean for the Flux and equal the "area below" to 1.
            temp_df = (df["Flux"].rolling(2).mean()[1:]*(np.diff(df["Angstrom"])))
            df["Flux"] = df["Flux"]/temp_df.sum()
            
            # Write to file
            df.to_csv(create_btsettl_path(name), header=False, index=False, mode="a")
In [50]:
######################%timeit create_btsettl_data_FINAL(data_NIR_avg_LOCAL, file_names_settl[1:2], "NIR")
In [51]:
len(file_names_settl)
Out[51]:
4046
In [52]:
from multiprocessing import Process

start1 = time.time()

#if __name__ == '__main__':
#    try:
#        mp.set_start_method('fork')
#    except RuntimeError:
#        pass

    # Number of cores to use
    number_of_cores = 15
    files_per_core = 7
    
    # Setup a list of processes that we want to run   
    # ITS IMPORTANT THAT file_names_settl IS DEFINED WITHIN A RANGE RATHER THAN ONE SPECIFIC POSITION
    file_nbr = 3000
    for _ in range(files_per_core):
        processes = []
        for _ in range(number_of_cores):
            prr = Process(target=create_btsettl_data_FINAL, 
                          args=(data_NIR_avg_LOCAL, 
                                file_names_settl[file_nbr:(file_nbr+1)], "NIR"))
            processes.append(prr)
            file_nbr += 1
    
        # Run processes
        for p in processes:
            p.start()

        # Exit the completed processes
        for p in processes:
            p.join()

# Check how long time it took.
end1 = time.time()
tt1 = end1 - start1
print("Total run time: {0:0.0f} min and {1:3.0f} s ".format((tt1-tt1%60)/60, tt1%60))
  File "<ipython-input-52-98d8a0311e4e>", line 12
    number_of_cores = 15
    ^
IndentationError: unexpected indent
In [ ]:
from multiprocessing import Process

# Time the process
start1 = time.time()

#if __name__ == '__main__':

#    try:
#        mp.set_start_method('fork')
#    except RuntimeError:
#        pass

    number_of_cores = 15
    files_per_core = 4
    
    # Setup a list of processes that we want to run   
    file_nbr = 3000
    for _ in range(files_per_core):
        processes = []
        for _ in range(number_of_cores):
            prr = Process(target=create_btsettl_data_FINAL, 
                          args=(data_VIS_avg_LOCAL, 
                                file_names_settl[file_nbr:(file_nbr+1)], "VIS"))
            processes.append(prr)
            file_nbr += 1
    
        # Run processes
        for p in processes:
            p.start()

        # Exit the completed processes
        for p in processes:
            p.join()

# Check how long time it took.
end1 = time.time()
tt1 = end1 - start1
print("Total run time: {0:0.0f} min and {1:3.0f} s ".format((tt1-tt1%60)/60, tt1%60))

Plot some results

NIR (new FASTER method)

In [53]:
def get_final_fileInfo122(NIR_or_VIS):
    if NIR_or_VIS.upper() == "NIR":
        tp = "NIR/"
    elif NIR_or_VIS.upper() == "VIS":
        tp = "VIS/"
    else:
        print("Something is not correct")
    
    # Get the directory for each .dat file and store them in file_paths
    # Choose directory depending on the OS I'm running (similar code is found through the entire project)
    if "Darwin" in os.uname():
        directory = "/Users/jakob/Desktop/Master Project/bt-settl/" + tp
    else:
        directory = "/home/salomonsson/Master Project/bt-settl-PLOT/" + tp
    
    file_paths = glob(directory + "*.dat")            
    #print(file_paths)
    
    # Get the .dat file names
    file_names = []
    for root, dirs, files in os.walk(directory):                                 
        for filename in files:
            if filename.endswith(".dat"):
                filename = filename[:-10]                     # Remove unneccessary "common" information from the name
                file_names.append(filename)
    return file_names, file_paths

NIR_new_final_file_names_NEW, NIR_new_final_file_paths_NEW = get_final_fileInfo122("NIR")


print("This many NIR files were loaded: {0}".format(len(NIR_new_final_file_names_NEW)))
This many NIR files were loaded: 16
In [54]:
def load_FINAL_data122(paths, names):
    """ Load "this_many" files of the final pre-processed data"""
    all_loaded_data = []                                             # Create a list to store the data. 
    all_loaded_data_NAMES = []
    for i in range(len(paths)):
        data = pd.read_table(paths[i], header=None, skip_blank_lines=True, comment='#', skiprows=3, sep=",")
        data.index.names = [names[i]]                     # Add the file name to the dataframe
        data.dropna(inplace=True)                                    # Remove NaN-values (if any)
        
        NoV = paths[i][-7:-4]
        data.index.names = [str(data.index.names)[2:-2] + "_" + str(NoV)]       # add NIR/VIS to name
        data.columns = ["Angstrom", "Flux"]
        all_loaded_data.append(data)
        all_loaded_data_NAMES.append(data.index.names)
    return all_loaded_data, all_loaded_data_NAMES


paths = NIR_new_final_file_paths_NEW
names = NIR_new_final_file_names_NEW    
NIR_new_loaded_FINAL_data_NEW, NIR_new_loaded_final_NAMES_NEW = load_FINAL_data122(paths, names)
In [55]:
NIR_new_final_file_names_NEW, NIR_new_final_file_paths_NEW = get_final_fileInfo122("NIR")
In [56]:
pathsNIR = NIR_new_final_file_paths_NEW
namesNIR = NIR_new_final_file_names_NEW    
NIR_new_loaded_FINAL_data_NEW, NIR_new_loaded_final_NAMES_NEW = load_FINAL_data122(pathsNIR, namesNIR)
In [57]:
for i in range(len(NIR_new_final_file_names_NEW)):
    original_data = get_btsettl_data_byName(NIR_new_final_file_names_NEW[i])
    print(original_data.iloc[:1,:])

    xx_orig = original_data.iloc[250000:600000, 0].values
    yy_orig = original_data.iloc[250000:600000, 1].values
    plt.figure(figsize=(15,8))
    plt.plot(xx_orig, yy_orig, color='green')
    plt.title("Original Un-processed Data, NIR", fontsize=20, weight=0)
    plt.xlim(xx_orig[0], xx_orig[-1])
    plt.autoscale(enable=True, axis="x", tight=True)
    loc = matplotlib.ticker.LinearLocator(numticks=5)
    plt.gca().xaxis.set_major_locator(loc)
    plt.xticks(fontsize=15, weight=0)
    plt.yticks(fontsize=15, weight=0)
    plt.show()
    
    print(NIR_new_loaded_FINAL_data_NEW[i].iloc[:1,:])
    xx = NIR_new_loaded_FINAL_data_NEW[i].iloc[:, 0].values
    yy = NIR_new_loaded_FINAL_data_NEW[i].iloc[:, 1].values

    plt.figure(figsize=(15,8))
    plt.plot(xx, yy, color='green')
    plt.autoscale(enable=True, axis="x", tight=True)
    plt.title("Pre-processed Data, NIR", fontsize=20, weight=0)
    plt.xticks(fontsize=15, weight=0)
    plt.yticks(fontsize=15, weight=0)
    plt.show()
    print("=============================================================================================================")
    print("=============================================== STARTING NEW ================================================")
    print("=============================================================================================================")
    print()
    print()
    print()
    
                     Angstrom          Flux
lte012.0-4.5-0.0a+0                        
0                         0.0  1.856950e-97
                            Angstrom      Flux
lte012.0-4.5-0.0a+0_NIR                       
0                        9603.977409  0.004469
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte013.0-2.5-0.0a+0                        
0                         0.0  5.786290e-98
                            Angstrom      Flux
lte013.0-2.5-0.0a+0_NIR                       
0                        9603.977409  0.003178
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte013.0-4.5-0.0a+0                        
0                         0.0  1.856950e-97
                            Angstrom      Flux
lte013.0-4.5-0.0a+0_NIR                       
0                        9603.977409  0.004817
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte013.0-3.0-0.0a+0                        
0                         0.0  8.018630e-98
                            Angstrom      Flux
lte013.0-3.0-0.0a+0_NIR                       
0                        9603.977409  0.002754
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte012.0-2.5-0.0a+0                        
0                         0.0  5.804970e-98
                            Angstrom      Flux
lte012.0-2.5-0.0a+0_NIR                       
0                        9603.977409  0.002287
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte012.0-4.0-0.0a+0                        
0                         0.0  1.386440e-97
                            Angstrom      Flux
lte012.0-4.0-0.0a+0_NIR                       
0                        9603.977409  0.003734
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte012.0-3.5-0.0a+0                        
0                         0.0  1.067580e-97
                            Angstrom      Flux
lte012.0-3.5-0.0a+0_NIR                       
0                        9603.977409  0.003012
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte012.0-3.0-0.0a+0                        
0                         0.0  8.040810e-98
                            Angstrom      Flux
lte012.0-3.0-0.0a+0_NIR                       
0                        9603.977409  0.002638
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte014.0-3.0-0.0a+0                        
0                         0.0  7.985460e-98
                            Angstrom      Flux
lte014.0-3.0-0.0a+0_NIR                       
0                        9603.977409  0.002887
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte013.0-5.0-0.0a+0                        
0                         0.0  2.588210e-97
                            Angstrom      Flux
lte013.0-5.0-0.0a+0_NIR                       
0                        9603.977409  0.004919
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte013.0-5.5-0.0a+0                        
0                         0.0  3.850350e-97
                            Angstrom     Flux
lte013.0-5.5-0.0a+0_NIR                      
0                        9603.977409  0.00481
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte013.0-4.0-0.0a+0                        
0                         0.0  1.386440e-97
                            Angstrom      Flux
lte013.0-4.0-0.0a+0_NIR                       
0                        9603.977409  0.004201
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte012.0-5.0-0.0a+0                        
0                         0.0  2.588210e-97
                            Angstrom      Flux
lte012.0-5.0-0.0a+0_NIR                       
0                        9603.977409  0.004239
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte013.0-3.5-0.0a+0                        
0                         0.0  1.064880e-97
                            Angstrom      Flux
lte013.0-3.5-0.0a+0_NIR                       
0                        9603.977409  0.003739
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte012.0-5.5-0.0a+0                        
0                         0.0  3.850350e-97
                            Angstrom      Flux
lte012.0-5.5-0.0a+0_NIR                       
0                        9603.977409  0.004273
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte014.0-3.5-0.0a+0                        
0                         0.0  1.062920e-97
                            Angstrom      Flux
lte014.0-3.5-0.0a+0_NIR                       
0                        9603.977409  0.003688
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



Plot several NIR datasets in one

In [58]:
xx1 = NIR_new_loaded_FINAL_data_NEW[3].iloc[:, 0]
yy1 = NIR_new_loaded_FINAL_data_NEW[3].iloc[:, 1]
xx2 = NIR_new_loaded_FINAL_data_NEW[2].iloc[:, 0]
yy2 = NIR_new_loaded_FINAL_data_NEW[2].iloc[:, 1]
xx3 = NIR_new_loaded_FINAL_data_NEW[-5].iloc[:, 0]
yy3 = NIR_new_loaded_FINAL_data_NEW[-5].iloc[:, 1]

plt.figure(figsize=(15,8))
plt.plot(xx1.values, yy1.values, color='green')
plt.plot(xx2.values, yy2.values, color='red')
plt.plot(xx3.values, yy3.values, color='black')
plt.title("Data ready for POWER matrices, NIR", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=18, weight=0)
plt.xlabel("Angstrom", fontsize=18, weight=0)
plt.legend([xx1.index.names[0], xx2.index.names[0], xx3.index.names[0]], loc="upper right", fontsize=15)
plt.xticks(fontsize=14, weight=0)
plt.yticks(fontsize=14, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.show()

Plot VIS values

In [59]:
VIS_new_final_file_names_NEW, VIS_new_final_file_paths_NEW = get_final_fileInfo122("VIS")
#VIS_new_final_file_names_NEW
In [60]:
pathsVIS = VIS_new_final_file_paths_NEW
namesVIS = VIS_new_final_file_names_NEW    
VIS_new_loaded_FINAL_data_NEW, VIS_new_loaded_final_NAMES_NEW = load_FINAL_data122(pathsVIS, namesVIS)

print("This many VIS files were loaded: {0}".format(len(VIS_new_loaded_FINAL_data_NEW)))
This many VIS files were loaded: 16
In [61]:
for i in range(len(VIS_new_final_file_names_NEW)):
    original_data = get_btsettl_data_byName(VIS_new_final_file_names_NEW[i])
    print(original_data.iloc[:1,:])

    xx_orig = original_data.iloc[100000:350000, 0].values
    yy_orig = original_data.iloc[100000:350000, 1].values
    plt.figure(figsize=(15,8))
    plt.plot(xx_orig, yy_orig)
    plt.title("Original Un-processed Data, VIS", fontsize=20, weight=0)
    plt.xlim(xx_orig[0], xx_orig[-1])
    plt.autoscale(enable=True, axis="x", tight=True)
    loc = matplotlib.ticker.LinearLocator(numticks=5)
    plt.xticks(fontsize=15, weight=0)
    plt.yticks(fontsize=15, weight=0)
    plt.gca().xaxis.set_major_locator(loc)
    plt.show()
    
    print(VIS_new_loaded_FINAL_data_NEW[i].iloc[:1,:])
    xx = VIS_new_loaded_FINAL_data_NEW[i].iloc[:, 0].values
    yy = VIS_new_loaded_FINAL_data_NEW[i].iloc[:, 1].values

    plt.figure(figsize=(15,8))
    plt.plot(xx, yy)
    plt.title("Pre-processed Data, VIS", fontsize=20, weight=0)
    plt.xticks(fontsize=15, weight=0)
    plt.yticks(fontsize=15, weight=0)
    plt.autoscale(enable=True, axis="x", tight=True)
    plt.show()
    print("=============================================================================================================")
    print("=============================================== STARTING NEW ================================================")
    print("=============================================================================================================")
    print()
    print()
    print()
    
    
                     Angstrom          Flux
lte013.0-3.0-0.0a+0                        
0                         0.0  8.018630e-98
                            Angstrom      Flux
lte013.0-3.0-0.0a+0_VIS                       
0                        5164.389835  0.006274
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte013.0-5.0-0.0a+0                        
0                         0.0  2.588210e-97
                            Angstrom      Flux
lte013.0-5.0-0.0a+0_VIS                       
0                        5164.389835  0.019821
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte012.0-5.5-0.0a+0                        
0                         0.0  3.850350e-97
                            Angstrom      Flux
lte012.0-5.5-0.0a+0_VIS                       
0                        5164.389835  0.020808
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte014.0-3.5-0.0a+0                        
0                         0.0  1.062920e-97
                            Angstrom      Flux
lte014.0-3.5-0.0a+0_VIS                       
0                        5164.389835  0.004693
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte012.0-4.5-0.0a+0                        
0                         0.0  1.856950e-97
                            Angstrom      Flux
lte012.0-4.5-0.0a+0_VIS                       
0                        5164.389835  0.021239
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte012.0-3.5-0.0a+0                        
0                         0.0  1.067580e-97
                            Angstrom      Flux
lte012.0-3.5-0.0a+0_VIS                       
0                        5164.389835  0.020811
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte012.0-4.0-0.0a+0                        
0                         0.0  1.386440e-97
                            Angstrom      Flux
lte012.0-4.0-0.0a+0_VIS                       
0                        5164.389835  0.021626
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte013.0-4.0-0.0a+0                        
0                         0.0  1.386440e-97
                            Angstrom      Flux
lte013.0-4.0-0.0a+0_VIS                       
0                        5164.389835  0.015751
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte012.0-2.5-0.0a+0                        
0                         0.0  5.804970e-98
                            Angstrom      Flux
lte012.0-2.5-0.0a+0_VIS                       
0                        5164.389835  0.007294
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte013.0-2.5-0.0a+0                        
0                         0.0  5.786290e-98
                            Angstrom      Flux
lte013.0-2.5-0.0a+0_VIS                       
0                        5164.389835  0.003635
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte014.0-3.0-0.0a+0                        
0                         0.0  7.985460e-98
                            Angstrom     Flux
lte014.0-3.0-0.0a+0_VIS                      
0                        5164.389835  0.00328
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte012.0-3.0-0.0a+0                        
0                         0.0  8.040810e-98
                            Angstrom      Flux
lte012.0-3.0-0.0a+0_VIS                       
0                        5164.389835  0.013342
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte013.0-5.5-0.0a+0                        
0                         0.0  3.850350e-97
                            Angstrom      Flux
lte013.0-5.5-0.0a+0_VIS                       
0                        5164.389835  0.019995
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte013.0-3.5-0.0a+0                        
0                         0.0  1.064880e-97
                            Angstrom      Flux
lte013.0-3.5-0.0a+0_VIS                       
0                        5164.389835  0.010286
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte013.0-4.5-0.0a+0                        
0                         0.0  1.856950e-97
                            Angstrom      Flux
lte013.0-4.5-0.0a+0_VIS                       
0                        5164.389835  0.015456
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



                     Angstrom          Flux
lte012.0-5.0-0.0a+0                        
0                         0.0  2.588210e-97
                            Angstrom      Flux
lte012.0-5.0-0.0a+0_VIS                       
0                        5164.389835  0.022017
=============================================================================================================
=============================================== STARTING NEW ================================================
=============================================================================================================



Plot several VIS datasets in one

In [62]:
x1 = VIS_new_loaded_FINAL_data_NEW[1].iloc[:, 0]
y1 = VIS_new_loaded_FINAL_data_NEW[1].iloc[:, 1]
x2 = VIS_new_loaded_FINAL_data_NEW[4].iloc[:, 0]
y2 = VIS_new_loaded_FINAL_data_NEW[4].iloc[:, 1]
x3 = VIS_new_loaded_FINAL_data_NEW[6].iloc[:, 0]
y3 = VIS_new_loaded_FINAL_data_NEW[6].iloc[:, 1]

plt.figure(figsize=(15,8))
plt.plot(x1.values, y1.values, color='green')
plt.plot(x2.values, y2.values, color='red')
plt.plot(x3.values, y3.values, color='black')
plt.title("Data ready for POWER matrices, VIS", fontsize=20, weight=0)
#plt.title("Final Pre-processed data, VIS", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=18, weight=0)
plt.xlabel("Angstrom", fontsize=18, weight=0)
plt.legend([x1.index.names[0], x2.index.names[0], x3.index.names[0]], loc="upper right", fontsize=15)
plt.xticks(fontsize=14, weight=0)
plt.yticks(fontsize=14, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.show()

Image Creation (Spectograms)

In [64]:
def get_btsettl_READY_Info(NIR_or_VIS):
    """ Returns the file paths and names for the pre-processed bt-settl data. 
        'NIR_or_VIS' is a string, either 'NIR' or 'VIS'. """
    if NIR_or_VIS.upper() == "NIR":
        tp = "NIR/"
    elif NIR_or_VIS.upper() == "VIS":
        tp = "VIS/"
    else:
        print('The input is not correct. Enter "VIS" or "NIR" as strings. ')
    
    # Get the directory for each .dat file and store them in file_paths
    # Choose directory depending on the OS I'm running 
    if "Darwin" in os.uname():
        directory = "/Users/jakob/Desktop/Master Project/bt-settl (new version)/" + tp
    else:
        directory = "/home/salomonsson/Master Project/bt-settl (new version)/" + tp
        #directory = "/home/salomonsson/Master Project/bt-settl-CLEAN/" + tp
    
    btsettl_READY_paths = glob(directory + "*.dat")            

    # Get the .dat file names
    btsettl_READY_names = []
    for root, dirs, files in os.walk(directory):                                 
        for filename in files:
            if filename.endswith(".dat"):
                filename = filename[:-4]                     # Remove unneccessary "common" information from the name
                btsettl_READY_names.append(filename)
    return btsettl_READY_names, btsettl_READY_paths

btsettl_READY_names_NIR, btsettl_READY_paths_NIR = get_btsettl_READY_Info("NIR")
btsettl_READY_names_VIS, btsettl_READY_paths_VIS = get_btsettl_READY_Info("VIS")
    

# This function needed to be rewritten and changed even though it does the same thing on very similar input data as the 
# "load_all_data" function used in the very beginning of this document. I havent found a reason for why finding all 
# the "#" didnt worked as before. 
def load_btsettl_READY_data(name):
    """ Returns the btsettl dataframe with the name "name". """
    if "NIR" in name:
        paths = btsettl_READY_paths_NIR
    elif "VIS" in name:
        paths = btsettl_READY_paths_VIS
    else:
         print('Input is incorrect. Enter a file name.')  
            
    for path in paths:
        if name in path:
            data = pd.read_table(path, sep=",", header=None, skiprows=3)
            data.index.names = [name]                               # Add the file name to the dataframe  
            #limits = (data[data.values == '#'])                    # Find all the indeces with a "#" (Not working!)
            limits = {'# order: 0': 0}
            for i in range(len(data)):
                if "#" in data.iloc[i,0]:
                    limits[data.iloc[i,0]] = data.index[i]
                    
            keys = [key for key, value in limits.items()]
            values = [int(value) for key, value in limits.items()]            
            data_pieces, count = [], 0
            for i in range(len(limits)):
                start = values[count]
                if count == (len(limits)-1):
                    d_piece = data.iloc[start:,:]        # The last data piece contains the last order in the df data
                else:
                    d_piece = data.iloc[start:values[count+1],:]  # Select the correct data piece
                d_piece.index.names = [str(d_piece.index.names)[2:-2] + "_Order_" + str(count)] # add order to name
                d_piece.columns = ["Angstrom", "Flux"]
                d_piece = d_piece.dropna(inplace=False)             # Remove NaN-values
                #data_piece.dropna(inplace=True)     # Not like this to avoid a "SettingWithCopyWarning" error
                data_pieces.append(d_piece)
                count += 1
    return data_pieces


test_data = load_btsettl_READY_data(btsettl_READY_names_VIS[0])
In [65]:
btsettl_READY_names_VIS[0]
Out[65]:
'lte026+0.5+0.3a+0.0_VIS'

Check wheather all the orders exists in both the NIR and VIS files (its easier to check this in the carmenes data). If the orders dont match, remove those that are not common for all the data

In [66]:
# Check whether all the orders exists in both the NIR and VIS files. (Its easier to check this in the carmenes data)

def check_orders(data):
    """ data is a list with dataframes. Returns True if orders are okay, else False."""
    first = len(data[0])
    for d in range(1, len(data)):
        if len(data[d]) == len(data[0]):
            break
        else:
            print("Number of orders is not the same in all dataframes")
            return False
    print("Number of orders is equal")
    return True


# I skip this for now as the data is OK.
def remove_non_common_orders(data):
    """ data is a list with dataframes.
        If there are any orders that arent common for all dataframes, remove them. """
    # First, check how many orders there should be
    how_many = []
    for i in range(len(data)):
        how_many.append(len(data[i]))
    should_be = max(how_many)
    
    all_bad_lengths = []
    for l in range(len(data)):
        if len(data[l]) != should_be:                          # if length is not as it should be
            this_length = []
            for ll in range(len(data[l])):
                order = str(data[l][ll].index.names)[34:-2]    # get the order number
                this_length.append(order)
            all_bad_lengths.append(this_length)
    print(all_bad_lengths)
In [67]:
# wavelets documentation
# http://pycwt.readthedocs.io/en/latest/tutorial.html#time-series-spectral-analysis-using-wavelets

def get_POWER_matrix(data, rescale=True):
    """ Returns the power matrix, the scale indices and the Fourier frequencies.
        Set 'rescale' to False if rescaling is not desired. """
    t0, dt, N = 0, 0.5, data.size
    t = np.arange(0, N) * dt*2 + t0
    p = np.polyfit(t - t0, data, 1)
    dat_notrend = data - np.polyval(p, t - t0)
    std = dat_notrend.std()           # Standard deviation
    var = std ** 2                    # Variance
    dat_norm = dat_notrend / std      # Normalised dataset
    mother = wavelet.Morlet(6)        # Morlet transformation function
    
    # The following routines perform the wavelet transformation using the parameters defined above.
    wave, scales, freqs, coi, fft, fftfreqs = wavelet.cwt(dat_norm, dt, J=-1)
    power = (np.abs(wave)) ** 2
    if rescale == True:
        power *= 255.0/power.max()     # rescale 0-255
        power = power.astype('h')      # dtype = short
        scales *= 255.0/scales.max()
        scales = scales.astype('h')
        freqs *= 255.0/freqs.max()
        freqs = freqs.astype('h')
    return power, scales, freqs


def get_FULL_POWER_matrix(data):
    """ Returns the power matrix, the scale indices and the Fourier frequencies of "data". 
        All orders included.
    """
    power_matrix, full_scales, full_freqs = np.array([1]), np.array([1]), np.array([1])
    for i in range(len(data)):
        power, scales, freqs = get_POWER_matrix(data[i].loc[:, "Flux"])
        if len(power_matrix) > 10:                                # if no power matrix is created            
            power_matrix = np.append(power_matrix, power, axis=0)
            full_scales = np.append(full_scales, scales, axis=0)
            full_freqs = np.append(full_freqs, freqs, axis=0)
        else:        
            power_matrix = np.array(power)
            full_scales = np.array(scales)
            full_freqs = np.array(freqs)
    return power_matrix, full_scales, full_freqs


def plot_data(data, rescale=True):
    """ Plot the data for a single order. data is a one column array. """
    t0, dt, N = 0, 0.5, data.size
    t = np.arange(0, N) * dt*2 + t0
    p = np.polyfit(t - t0, data, 1)
    dat_notrend = data - np.polyval(p, t - t0)
    std = dat_notrend.std()          # Standard deviation
    var = std ** 2                   # Variance
    dat_norm = dat_notrend / std     # Normalised dataset
    mother = wavelet.Morlet(6)       # Morlet itransformation function
    s0 = 2 * dt                      # Starting scale, in this case 2 * 0.25 years = 6 months
    dj = 1 / 48                      # Twelve sub-octaves per octaves
    J = -1
    alpha, _, _ = wavelet.ar1(data)  # Lag-1 autocorrelation for red noise

    # The following routines perform the wavelet transformation using the parameters defined above. 
    wave, scales, freqs, coi, fft, fftfreqs = wavelet.cwt(dat_norm, dt, J=-1)
    power = (np.abs(wave)) ** 2
    if rescale == True:
        power *= 255.0/power.max()     # rescale 0-255
        power = power.astype('h')      # dtype = short
        
    
    period = 1 / freqs
    signif, fft_theor = wavelet.significance(1.0, dt, scales, 0, 
                                             alpha, significance_level=0.95, wavelet=mother)
    sig95 = np.ones([1, N]) * signif[:, None]
    sig95 = power / sig95

    # Finally, we plot our results.
    plt.close('all')
    plt.ioff()
    fig = plt.figure(figsize=(12.5, 40), dpi=80)

    # Second sub-plot, the normalised wavelet power spectrum and significance level contour 
    # lines and cone of influece hatched area. Note that period scale is logarithmic.
    bx = plt.axes([0.1, 0.37, 0.65, 0.28])
    levels = [0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8, 16]
    
    bx.contourf(t, np.log2(period), np.log2(power), np.log2(levels), 
                extend='both', cmap=plt.cm.viridis)
    extent = [t.min(), t.max(), 0, max(period)]
    bx.contour(t, np.log2(period), sig95, [-99, 1], colors='k', linewidths=1, extent=extent)
    bx.set_ylabel('Period', fontsize=18)
    bx.set_xlabel('Index', fontsize=18)
    plt.autoscale(enable=True, axis="x", tight=True)
    Yticks = 2 ** np.arange(np.ceil(np.log2(period.min())), np.ceil(np.log2(period.max())))
    bx.set_yticks(np.log2(Yticks))
    bx.set_yticklabels(Yticks, fontsize=14)
    plt.xticks(fontsize=14)
    plt.title("POWER matrix (Spectrogram) for " + str(data.index.names)[5:-2], fontsize=18, weight=0)
    
    plt.show()
In [68]:
# Make sure the equal amount of orders exists in all NIR and VIS files respectively.
NIR_OK, VIS_OK = "NO", "NO"
if check_orders(all_loaded_data_NIR) == True:
    NIR_OK = "YES"
if check_orders(all_loaded_data_VIS) == True:
    VIS_OK = "YES"
Number of orders is equal
Number of orders is equal
In [69]:
test_data[3].loc[:, "Flux"].shape
Out[69]:
(2871,)

Plot the power matrix generated from one of the NIR files

In [70]:
# Plot the power matrices if the orders are equal in the files. 
if VIS_OK == "YES" and NIR_OK == "YES":
    for i in range(5):
        #print("POWER matrix (Spectrogram) for " + str(test_data[i+1].loc[:, "Flux"].index.names)[5:-2])
        plot_data(test_data[i+1].loc[:, "Flux"], rescale=False)
/home/salomonsson/anaconda3/envs/MP2/lib/python3.6/site-packages/scipy/fftpack/basic.py:160: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  z[index] = x

Create power matrices for the pre-processed bt-settl data

In [71]:
def create_MATRIX_path(name, NIR_or_VIS):
    """Create file path to store files. 'name' and 'NIR_or_VIS' are strings."""
    if NIR_or_VIS.upper() == "NIR":
        tp = "NIR/"
    elif NIR_or_VIS.upper() == "VIS":
        tp = "VIS/"
    else:
        print('Please enter "NIS" or "VIR" as second input parameter.')
        
    if "Darwin" in os.uname():
        base = "/Users/jakob/Desktop/Master Project/POWER_matrices/" + tp
    else:
        base = "/home/salomonsson/drive/POWER_matrices/" + tp
    return(base + name + ".csv")

NIR

In [5]:
### Each NIR file takes some 20 seconds to process. 335 min in total ###

btsettl_READY_names_NIR, btsettl_READY_paths_NIR = get_btsettl_READY_Info("NIR")

if "Darwin" in os.uname():
    d = "/Users/jakob/Desktop/Master Project/POWER_matrices/NIR/"
else:
    d = "/home/salomonsson/drive/POWER_matrices/NIR/"

nir_files = []
for dirpath, dirnames, files in os.walk(d):   # Get the amount of NIR files in the directory d
    for f in files:
        if "NIR" in f:
            nir_files.append(f) 
            
if len(nir_files) < (len(btsettl_READY_names_NIR)):   # if all the VIS files havent been "converted" to power matrices 
    start1 = time.time()
    if NIR_OK == "YES":
        for i in range(len(btsettl_READY_names_NIR)):      #len(btsettl_READY_names_NIR)
            NIR_name = btsettl_READY_names_NIR[i]
            NIR_d = load_btsettl_READY_data(NIR_name)
            NIR_m = get_FULL_POWER_matrix(NIR_d)[0]     # Get the Flux's power matrix
            NIR_path = create_MATRIX_path(NIR_name, 'NIR')
            pd.DataFrame(NIR_m, dtype='h').to_csv(NIR_path, header=True, index=False, mode="w")  # dtype = short
    
    # Check how long time it took.
    end1 = time.time()
    tt1 = end1 - start1
    print("Total run time: {0:0.0f} min and {1:3.0f} s ".format((tt1-tt1%60)/60, tt1%60))
else:
    print("All the power matrices for the pre-processed NIR files are already in the directory")   
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-f69755bdb109> in <module>()
      1 ### Each NIR file takes some 20 seconds to process. 335 min in total ###
      2 
----> 3 btsettl_READY_names_NIR, btsettl_READY_paths_NIR = get_btsettl_READY_Info("NIR")
      4 
      5 if "Darwin" in os.uname():

NameError: name 'get_btsettl_READY_Info' is not defined

VIS

In [ ]:
### Each VIS file takes some 30 seconds to process. ###

btsettl_READY_names_VIS, btsettl_READY_paths_VIS = get_btsettl_READY_Info("VIS")

if "Darwin" in os.uname():
    d = "/Users/jakob/Desktop/Master Project/POWER_matrices/VIS/"
else:
    d = "/home/salomonsson/drive/POWER_matrices/VIS/"  
    #d = "/home/salomonsson/drive/POWER_matrices_CLEAN/VIS/"

vis_files = []
for dirpath, dirnames, files in os.walk(d):        # Get the amount of VIS files in the directory d
    for f in files:
        if "VIS" in f:
            vis_files.append(f) 
            
if len(vis_files) < len(btsettl_READY_names_VIS):   # if all the VIS files havent been "converted" to power matrices 
    start1 = time.time()
    if VIS_OK == "YES":
        for i in range(len(btsettl_READY_names_VIS)):
            VIS_name = btsettl_READY_names_VIS[i]
            VIS_d = load_btsettl_READY_data(VIS_name)
            VIS_m = get_FULL_POWER_matrix(VIS_d)[0]
            VIS_path = create_MATRIX_path(VIS_name, 'VIS')
            pd.DataFrame(VIS_m, dtype='h').to_csv(VIS_path, header=True, index=False, mode="w")  # dtype = short

    # Check how long time it took.
    end1 = time.time()
    tt1 = end1 - start1
    print("Total run time: {0:0.0f} min and {1:3.0f} s ".format((tt1-tt1%60)/60, tt1%60))
else:
    print("All the power matrices for the pre-processed VIS files are already in the directory")

Start the CNN process

Create the model architecture

There are probably no advantages of using transfer learning as the images in our case differs so much from those used in a pre-trained model as ResNet, Xception or others.

In [13]:
# https://github.com/navoshta/behavioral-cloning/blob/master/model.py

def CNN_model_for_FULL(input_shape):
    """ Create the model architecture. """
    model = Sequential()
    
    model.add(Conv2D(16, (7, 7), input_shape=input_shape, activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(32, (7, 7), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(64, (7, 7), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
##############    
    model.add(Conv2D(128, (5, 5), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(256, (5, 5), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(256, (5, 5), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(512, (3, 3), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(512, (3, 3), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(1024, (3, 3), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(1024, (3, 3), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
###############    

    model.add(Flatten())
#    model.add(Dense(500, activation='relu'))    
    model.add(Dense(100, activation='relu'))
    model.add(Dense(20, activation='relu'))
    # As we're building a regressor, the last activation function needs to be linear
    # https://towardsdatascience.com/activation-functions-and-its-types-which-is-better-a9a5310cc8f
    model.add(Dense(3, activation="linear"))           # We want to determine 3 different parameters
    
    #model.compile(optimizer=optimizers.Adam(lr=1e-04), loss='mean_squared_error')
    # Important to note: for regression cases, one needs to use mse or mae as loss function. 
    # You can't use softmax as activation (since the output of the model isn't supposed to be probability)!
    
    # Ett annat alternativ ar att ta bort te tva sista Conv layers samt byta Dense 100 mot Dense 500.
    
    model.summary()
    return model


 

def CNN_model_FULL(input_shape):
    """ Create the model architecture. """
    model = Sequential()
    
    model.add(Conv2D(16, (7, 7), input_shape=input_shape, activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(32, (7, 7), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(64, (7, 7), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))    
 
    model.add(Conv2D(128, (5, 5), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(256, (5, 5), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(512, (5, 5), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(1024, (3, 3), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Conv2D(1024, (3, 3), activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))

    model.add(Flatten())
   
    model.add(Dense(50, activation='relu'))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(3, activation="linear"))           # We want to determine 3 different parameters
    
    model.summary()
    return model

model.add(Reshape((x, y))): If the output of the MP layer is [None, 10, 10], then the reshape layer must create a shape that is exactly 10*10=100 elements; a valid shape would be (100,) or (25, 4). Reshaping only reshapes the dimensions of the data, it does not add/remove extra data to guarantee the asked for output shape.

The shape (None, 2) simply means that the network accepts a batch with any number of elements that themselves are arrays of two elements (The first element is batch size in Keras shapes). So the correct input data shape would be (2,).

Define some useful functions for data and parameter extraction

In [14]:
def substring_before(s, delim):
    """ Divide the string 's' at 'delim' and return both what's before and after. """
    before = s.partition(delim)[0]
    rest = s.partition(delim)
    return before, rest


def get_MATRIX_paths(NIR_or_VIS):
    """ Return all the matrix paths for 'NIR' or 'VIS'. """
    if NIR_or_VIS.upper() == "NIR":
        tp = "NIR/"
    elif NIR_or_VIS.upper() == "VIS":
        tp = "VIS/"
    else:
        print('Please enter "NIS" or "VIR".')
    if "Darwin" in os.uname():
        directory = "/Users/jakob/Desktop/Master Project/POWER_matrices/" + tp
    else:
        directory = "/home/salomonsson/drive/POWER_matrices/" + tp
    return glob(directory + "*.csv")


def get_Matrix_and_Parameters(path):
    """ Returns the matrix stored in 'path', with the corresponding parameters 
        (Temperature, Surface Gravity and Metallicity) 
    """
    matrix = pd.read_csv(path)     # load the power matrix        
    parameters = []
    filename = os.path.basename(path)[:-4]
    name = filename
    
    # Process the file name and extract the individual parameters. Store them in a list.
    #####
    # Remove the alpha parameter and do some editing to the file name
    for l in str(filename):
        if 'a' in l:     # if 'a' is in the file name
            d = 'a'
            break
        else:
            d = '_'
    filename = substring_before(filename[3:], d)[0]
            
    # Extract the Temperature
    for l in str(filename[:6]):
        if '-' in l:
            d1 = '-'
            break
        elif '+' in l:
            d1 = '+'
            break
        else:
            pass
    p, rest = substring_before(filename, d1)
    parameters.append(float(p))     
            
    # Extract the Surface gravity, Log(G)
    for l in str(rest[2][:5]):
        if '-' in l:
            de = '-'
            break
        elif '+' in l:
            de = '+'
            break
        else:
            pass               
    p1, rest1 = substring_before(rest[2], de)           
    parameters.append(float(d1 + p1))

    # Extract the Metallicity
    for l in str(rest[2]):
        if '-' in l:
            de1 = '-'
            break
        elif '+' in l:
            de1 = '+'
            break
        else:
            pass 
    p2, rest2 = substring_before(rest[2], de1)
    parameters.append(float(rest2[1] + rest2[2]))
    #####
    #print(parameters)
    return matrix.values, np.array(parameters), name

Create MinMaxScalers to scale the parameters within the interval [0,1].

The training goes both faster and yield better results when scaled.

In [15]:
# Functions for scaling and rescaling the parameters. 

def MMscale_param(param, name):
    """ A function for scaling the parameters in param. name is the matrix name from where the parameters origin. """
    if "NIR" in name:
        scalers = many_MinMaxScalers_NIR_param
    elif "VIS" in name:
        scalers = many_MinMaxScalers_VIS_param
    else:
        print('Have you entered a valid name?')   
        
    scaled_param = scalers[name].fit_transform(param.reshape(-1, 1))
    return scaled_param


def Un_scale_data(scaled_param, name):
    """ A function for unscaling the parameters in the variable scaled_param. """
    if "NIR" in name:
        scalers = many_MinMaxScalers_NIR_param
    elif "VIS" in name:
        scalers = many_MinMaxScalers_VIS_param
    else:
        print('Have you entered a valid name?')
        
    un_scaled_param = scalers[name].inverse_transform(scaled_param.reshape(-1,1))
    return un_scaled_param


def normalise(array):
    """ Normalise array to [0, 1]. Decrease the precision to float32 to save memory. """
    return (255.0 - array.astype("float32")) / 255.0

# The difference in using float32 and float16 is less then 0.5% when using the full sized matriz. Test float16 as well!

Specify one scaler for each matrix data. Name it accordingly to the data matrix's name

In [16]:
NIR_paths = get_MATRIX_paths("NIR")
VIS_paths = get_MATRIX_paths("VIS")

many_MinMaxScalers_NIR_param, many_MinMaxScalers_VIS_param = {}, {}
for i_s in range(len(NIR_paths)):
    name = os.path.basename(NIR_paths[i_s])[:-4]
    many_MinMaxScalers_NIR_param["{0}".format(name)]=MinMaxScaler(feature_range=(0,1), copy=True)
    # Set copy=False to perform inplace row normalisation and avoid a copy
    
for i_v in range(len(VIS_paths)):
    name = os.path.basename(VIS_paths[i_v])[:-4]
    many_MinMaxScalers_VIS_param["{0}".format(name)]=MinMaxScaler(feature_range=(0,1), copy=True)    

Define some data Generators

Sequence are a safer way to do multiprocessing. This structure guarantees that the network will only train once on each sample per epoch which is not the case with generators. The Generator generates data as input to the algorithm as feeding it all the data at once isnt possible due to memory limitations.

In [17]:
### https://stackoverflow.com/questions/47200146/keras-load-images-batch-wise-for-large-dataset
### https://keras.io/utils/#sequence  + modifications
class My_Sequence(Sequence):
    """ Generates batches of training data and ground truth. 
        Inputs are the image paths and batch size. 
    """
    def __init__(self, image_paths, batch_size):
        self.image_paths, self.batch_size = image_paths, batch_size
    
    def __len__(self):
        return int(np.ceil(len(self.image_paths) / float(self.batch_size)))
    
    def __getitem__(self, idx):
        batch = self.image_paths[idx * self.batch_size:(idx + 1) * self.batch_size]        
        matrices, parameters = [], []
        for file_path in batch:
            mat, param, name = get_Matrix_and_Parameters(file_path)
            
            #Transform the matrix from 2D to 3D as a (mat.shape[0], mat.shape[1]) RBG image. 
            # Rescale its values to [0,1]. Set "preserve_range=True" to not 
            # rescale the matrix, and by this saving memory and load time. 
            mat = skimage.transform.resize(mat, (mat.shape[0]//down_size, 
                                                 mat.shape[1]//down_size, 3), 
                                           mode='constant', preserve_range=True) 
            param = MMscale_param(param, name)           # Rescale the parameters
            mat = normalise(mat)                         # Rescale the matrix to [0, 1]
            
            matrices.append(mat)
            parameters.append(param)
        MAT, PAM = np.array(matrices), np.array(parameters)
        PAM = np.reshape(PAM, (PAM.shape[0], PAM.shape[1]))   
        
        gc.collect()                                     # Garbage Collector
        
        return MAT, PAM
                

def Data_Generator(image_paths, batch_size=16):
    """ Generates batches of training data and ground truth. Inputs are the image paths and batch size. """
    
    L = len(image_paths)
    
    #this line is just to make the generator infinite, keras needs that    
    while True:
        batch_start = 0
        batch_end = batch_size

        while batch_start < L:
            matrices, parameters = [], []
            
            # Select the batch
            batch = image_paths[batch_start:batch_end]      
            limit = min(batch_end, L)
            
            for path in batch:
                # Get the matrix, parameters and the name
                mat, param, name = get_Matrix_and_Parameters(path)
                
                #Transform the matrix from 2D to 3D as a (mat.shape[0], mat.shape[1]) RBG image. Rescale to [0,1]
                # Set "preserve_range=True" to not rescale the matrix, and by this saving memory and load time. 
                # Results from training might be worse though.
                mat = skimage.transform.resize(mat, (mat.shape[0]//down_size, mat.shape[1]//down_size, 3), 
                                               mode='constant', preserve_range=True)
                
                # Scale and normalise the parameters and the matrix
                param = MMscale_param(param, name)
                mat = normalise(mat)
                
                matrices.append(mat)
                parameters.append(param)
            
            # Convert the lists into arrays and reshape the parameter array to (batch size, number of parameters)
            MAT, PAM = np.array(matrices), np.array(parameters)
            PAM = np.reshape(PAM, (PAM.shape[0], PAM.shape[1])) 
            
            # Output the matrix and parameters array. Delete the data from RAM after it's been used to save memory
            yield MAT, PAM

            batch_start += batch_size   
            batch_end += batch_size
            gc.collect()   # Garbage Collector
            

Pre-training steps: NIR



Divide into training and validation sets

In [14]:
# Load and shuffle the paths randomly
matrix_paths_NIR = get_MATRIX_paths('NIR')
random.Random(32).shuffle(matrix_paths_NIR)                                    # Shuffle the same way each time

training_paths_NIR = matrix_paths_NIR[:int(len(matrix_paths_NIR)*0.8)]         # train on 80% of the data set
validation_paths_NIR = matrix_paths_NIR[int(len(matrix_paths_NIR)*0.8):int(len(matrix_paths_NIR)*0.9)]  # validate on 10%
test_paths_NIR = matrix_paths_NIR[int(len(matrix_paths_NIR)*0.9):]             # test on 10%

print("Number of training samples for NIR:   ", len(training_paths_NIR))
print("Number of validation samples for NIR: ", len(validation_paths_NIR))
print("Number of test samples for NIR:       ", len(test_paths_NIR))
Number of training samples for NIR:    819
Number of validation samples for NIR:  102
Number of test samples for NIR:        103
In [15]:
# Copy the test files to another folder
#for i in range(60):
#    shutil.copy(test_paths_NIR[i], "/home/salomonsson/Master Project/temporary/NIR/")

Set Matrix size

In [18]:
# Down size the matrix this much. down_size = 1 equals no scaling. However, this doesnt fit into the GPU memory!
down_size = 2

Create the NIR model

In [19]:
NIR_model = CNN_model_FULL(input_shape=(3724//down_size, 4073//down_size, 3))

# Divide the input shape by 8 and remove the last three layers (1024, 2048, 4096) in the CNN_model results
# in very few trainable parameters (8M, instead of 203). Try it!?
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d_1 (Conv2D)            (None, 1856, 2030, 16)    2368      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 928, 1015, 16)     0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 922, 1009, 32)     25120     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 461, 504, 32)      0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 455, 498, 64)      100416    
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 227, 249, 64)      0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 223, 245, 128)     204928    
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 111, 122, 128)     0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 107, 118, 256)     819456    
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 53, 59, 256)       0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 49, 55, 512)       3277312   
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 24, 27, 512)       0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 22, 25, 1024)      4719616   
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 11, 12, 1024)      0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 9, 10, 1024)       9438208   
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 4, 5, 1024)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20480)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                1024050   
_________________________________________________________________
dense_2 (Dense)              (None, 20)                1020      
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 63        
=================================================================
Total params: 19,612,557
Trainable params: 19,612,557
Non-trainable params: 0
_________________________________________________________________

Create a generator for the training set and validation set respectively. Compile the NIR model

In [18]:
batch_size = 2     # (When running with larger batch size on GPU, a ResourceExhaustedError is thrown.) 

my_training_batch_generator_NIR = Data_Generator(training_paths_NIR, batch_size)        # training generator
my_validation_batch_generator_NIR = Data_Generator(validation_paths_NIR, batch_size)    # validation generator
my_test_batch_generator_NIR = Data_Generator(test_paths_NIR, batch_size)                # testing generator


NIR_model.compile(optimizer=optimizers.Adam(lr=0.0001), loss='mean_squared_error', metrics=['mse'])
# Important to note: for regression cases, one needs to use mse or mae as loss function. 

Train the NIR model

Before that, create a checkpointer to store the best model.


When using the GPU for CNN training between different Jupyter notebooks, cleaning the GPU memory between the runs might be necessary. Do so by first calling sudo fuser -v /dev/nvidia* in a Terminal window before executing sudo kill -xxxxx PID, where xxxxx is the process' PID number.


In [ ]:
def fxn():
    """ Define a function to ignore a UserWarning related to precision loss when converting from int64 to float64."""
    warnings.warn("Possible precision loss when converting from int64 to float64", UserWarning)


if "Darwin" in os.uname():
    model_path = "/Users/jakob/Desktop/Master Project/Models/NIR_model.weights.best.hdf5"
    log_path = "/Users/jakob/Desktop/Master Project/Models/NIR_training.log"
else:
    model_path = "/home/salomonsson/Master Project/Models/NIR_model.weights.best.hdf5"
    log_path = "/home/salomonsson/Master Project/Models/NIR_training.log"
In [ ]:
# Load the model to continue the training
#NIR_model.load_weights(model_path)
In [ ]:
start = time.time()   # Time it
num_epochs = 100

with warnings.catch_warnings():                                                        # Catch and ignore the warning
    warnings.simplefilter("ignore")
    fxn() 
    csv_logger = CSVLogger(log_path, separator=',', append=False)   # Log the training
    checkpointer = ModelCheckpoint(filepath=model_path, 
                                   verbose=1, 
                                   save_best_only=True)                                # Save the best model 
    architecture = TensorBoard(log_dir='./Graph/NIR', histogram_freq=0, 
                               write_graph=True, write_images=True)                    # Save the CNN Architecture
    
    history = NIR_model.fit_generator(generator=my_training_batch_generator_NIR,
                                    steps_per_epoch=(len(validation_paths_NIR) // batch_size),
                                    epochs=num_epochs,
                                    verbose=1,
                                    callbacks=[checkpointer, csv_logger, architecture],
                                    validation_data=my_validation_batch_generator_NIR, 
                                    validation_steps=(len(validation_paths_NIR) // batch_size), 
                                    use_multiprocessing=True, 
                                    max_queue_size=3,
                                    workers=10,
                                    shuffle=True)

# Print total training time
end = time.time()
tt = end-start
print()
print("Total training time was {0:0.0f} min and {1:0.0f}s. ".format((tt-tt%60)/60, tt%60))

# max_queue_size : It specifies how many batches it’s going to prepare in the queue.
# It doesn’t mean you’ll have multiple generator instances, nor that the training starts only when the queue is filled.  

# If your targets are one-hot encoded, use categorical_crossentropy. "fit_generator" one-hot encodes 
# the input data automatically!
# But if your targets are integers, use sparse_categorical_crossentropy. 

# Increasing the number of "workers" might result in better GPU Utilisation as there are more CPUs 
# who feed the GPU with data. 


#backend.clear_session()
In [ ]:
# tensorboard --logdir=/home/salomonsson/Master\ Project/Graph/

Display the Architecture in TensorBoard

In [ ]:
architecture.set_model(VIS_model)

# Then run "tensorboard --logdir .Graph/VIS/" in Terminal

Plot Error and Loss per Epoch for NIR

In [ ]:
# (MSE and Loss is the same)
# Load the training history from file
log_data = pd.read_csv(log_path, sep=',', engine='python')
print(log_data.columns)

log_data = log_data.iloc[1:, :]

# Plot for the Error
plt.figure(figsize=(14,7))
plt.plot(log_data['loss'], color="black")
plt.plot(log_data['val_loss'], color="red")
plt.title("Mean Squared Error (NIR)", fontsize=20)
plt.ylabel("MSE", fontsize=17)
plt.xlabel("Epochs", fontsize=17)
plt.legend(["Training error:    " + str(log_data.iloc[-1,-3])[:8], 
            "Validation error: " + str(log_data.iloc[-1,-1])[:8]], 
           loc="upper right", fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.autoscale(enable=True, axis="x", tight=True)
plt.show()
In [ ]:
#print(history.history.keys())
#print(history.history['mean_squared_error'])

Load the NIR Model with Least Validation Loss

In [ ]:
if "Darwin" in os.uname():
    model_path = "/Users/jakob/Desktop/Master Project/Models/NIR_model.weights.best.hdf5"
    pred_path = "/Users/jakob/Desktop/Master Project/Predictions/Some_preds_NIR.txt"
else:
    model_path = "/home/salomonsson/Master Project/Models/NIR_model.weights.best.hdf5"
    pred_path = "/home/salomonsson/Master Project/Predictions/Some_preds_NIR.txt"
    
NIR_model.load_weights(model_path)

Make NIR predictions

In [ ]:
np.set_printoptions(precision=3)
list_for_saving = []

print("Parameters are printed as [Temperature, Log(G), Metallicity]")
print("---------------------------------------------------------------------------------------------")
for path in range(50):
    # Load the matrix, parameters and the name. Resize the matrix
    mat, param, name = get_Matrix_and_Parameters(test_paths_NIR[path])  
    mat = skimage.transform.resize(mat, (mat.shape[0]//down_size, mat.shape[1]//down_size, 3), 
                                           mode='constant', preserve_range=True)
    
    # Reshape the matrix to "[batch, 3D RGB image]"
    mat = np.reshape(mat, (1, mat.shape[0], mat.shape[1], mat.shape[2]))
    
    # Rescale the matrix to [0,1]
    mat = normalise(mat)                                                  
    
    # Fit the scaler on the parameters
    scaled_param = MMscale_param(param, name)
    
    # Use the model to predict the parameters.
    predicted_parameters = NIR_model.predict(mat)
    
    # Use the fitted scaler to unscale the parameters to their original interval
    unscaled_pred_param = Un_scale_data(predicted_parameters, name)
    
    # Calculate the difference in percent and store the values in a list
    diff = (abs(unscaled_pred_param.reshape(1,-1)-param))/abs(unscaled_pred_param.reshape(1,-1))
    
    # Prepare for saving to file
    d1 = pd.DataFrame(unscaled_pred_param.reshape(1,-1))
    d2 = pd.DataFrame(param.reshape(1, -1))
    d = pd.concat([d1, d2], axis=1)
    list_for_saving.append(d)
    
    # Print the results (transpose the unscaled predicted parameters for readability).
    print("Predicted: {0}, Real values: {1}, DIFF: {2}".format(np.transpose(unscaled_pred_param)[0], param, diff))
    print()


# Save to file
d2 = pd.concat(list_for_saving, axis=0)
columns = ["Pred. Temperature", "Pred Log(g)", "Pred Metallicity", 
           "True Temperature", "True Log(g)", "True Metallicity"]
d2.columns = columns
d2.to_csv(pred_path, header=True, index=False, mode="w")
print()
print("End")

Create Bar Chart NIR

In [ ]:
# Load the data first
loaded_predictions_NIR = pd.read_csv(pred_path, sep=',', engine='python')

def autolabel(rects, color='black', extra_height=0):
    """
    Attach a text label above each bar displaying its height
    """
    for rect in rects:
        height = rect.get_height()
        height1 = height + extra_height
        ax.text(rect.get_x() + rect.get_width()/2., 1.005*height1,
                '{0:2.0f}'.format(float(height)),
                ha='center', va='bottom', color=color)
        

def autolabel_2(rects, color='black', extra_height=0):
    """
    Attach a text label above each bar displaying its height
    """
    for rect in rects:
        height = rect.get_height()
        height1 = height + extra_height
        ax.text(rect.get_x() + rect.get_width()/2., 1.005*height1,
                '{0:2.1f}'.format(float(height)),
                ha='center', va='bottom', color=color)
        
In [ ]:
# data to plot
n_groups = 10
pred_temp = loaded_predictions_NIR.loc[1:10, "Pred. Temperature"] * 100
true_temp = loaded_predictions_NIR.loc[1:10, "True Temperature"] * 100
    
# create plot
fig, ax = plt.subplots(figsize=(7,5))
index = np.arange(n_groups)
bar_width = 0.15
opacity = 1
 
rects1 = plt.bar(index, pred_temp, bar_width,
                 alpha=opacity,
                 color='black',
                 label='Predicted Effective Temperature')
 
rects2 = plt.bar(index + bar_width, true_temp, bar_width,
                 alpha=opacity,
                 color='red',
                 label='True Temperature')
 
plt.xlabel('Test Run', fontsize=17)
plt.ylabel('Temperature (K)', fontsize=17)
plt.title('Eff. Temp. NIR, down x20', fontsize=20)
plt.xticks(index + bar_width, ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), fontsize=15)
plt.yticks(fontsize=15)
#plt.legend(loc="lower right", fontsize=15, framealpha=0.9)

plt.tight_layout()
plt.show()
In [ ]:
# data to plot
n_groups = 10
#pred_log = np.power(10, -1*loaded_predictions_NIR.loc[1:10, "Pred Log(g)"])
#true_log = np.power(10, -1*loaded_predictions_NIR.loc[1:10, "True Log(g)"])

# Lets go for the normal ones
pred_log = -1*loaded_predictions_NIR.loc[1:10, "Pred Log(g)"] 
true_log = -1*loaded_predictions_NIR.loc[1:10, "True Log(g)"]

# create plot
fig, ax = plt.subplots(figsize=(7,5))
index = np.arange(n_groups)
bar_width = 0.15
opacity = 1
 
rects1 = plt.bar(index, pred_log, bar_width,
                 alpha=opacity,
                 color='black',
                 label='Predicted Surface Gravity')
 
rects2 = plt.bar(index + bar_width, true_log, bar_width,
                 alpha=opacity,
                 color='red',
                 label='True Surface Gravity')
 
plt.xlabel('Test Run', fontsize=17)
plt.ylabel('Surface Gravity', fontsize=17)
plt.title('Surface Gravity NIR, down x20', fontsize=20)
plt.xticks(index + bar_width, ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), fontsize=15)
plt.yticks(fontsize=15)
#plt.legend(loc="upper right", fontsize=15, framealpha=0.9)

#autolabel_2(rects1, extra_height=-.15)
#autolabel_2(rects2, 'red', extra_height=-.15)  
 
plt.tight_layout()
plt.show()
In [ ]:
# data to plot
n_groups = 10
pred_met = loaded_predictions_NIR.loc[1:10, "Pred Metallicity"]
true_met = loaded_predictions_NIR.loc[1:10, "True Metallicity"]

# create plot
fig, ax = plt.subplots(figsize=(7,5))
index = np.arange(n_groups)
bar_width = 0.15
opacity = 1
 
rects1 = plt.bar(index, pred_met, bar_width,
                 alpha=opacity,
                 color='black',
                 label='Predicted Metallicity')
 
rects2 = plt.bar(index + bar_width, true_met, bar_width,
                 alpha=opacity,
                 color='red',
                 label='True Metallicity')
 
plt.xlabel('Test Run', fontsize=17)
plt.ylabel('Metallicity', fontsize=17)
plt.title('Metallicity NIR, down x20', fontsize=20)
plt.xticks(index + bar_width, ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), fontsize=15)
plt.yticks(fontsize=15)
#plt.legend(loc="lower right", fontsize=15, framealpha=0.9)
     
#autolabel_2(rects1, extra_height=-.15)
#autolabel_2(rects2, 'red', extra_height=-.15)

plt.tight_layout()
plt.show()

Evaluate the NIR Model

In [ ]:
loss_and_metrics = NIR_model.evaluate_generator(my_test_batch_generator_NIR, 
                                                steps=60, 
                                                max_queue_size=1, 
                                                workers=5, 
                                                use_multiprocessing=True, 
                                                verbose=1)
In [ ]:
print(NIR_model.metrics_names)
print(loss_and_metrics)

Calculate the Error over the entire Testing set for each one of the parameters for NIR

In [ ]:
# Load the data first
loaded_predictions_NIR = pd.read_csv(pred_path, sep=',', engine='python')
In [ ]:
# Remove 0-values
loaded_predictions_NIR = loaded_predictions_NIR[loaded_predictions_NIR.loc[:, "True Log(g)"] != 0.0]
loaded_predictions_NIR = loaded_predictions_NIR[loaded_predictions_NIR.loc[:, "True Metallicity"] != 0.0]
In [ ]:
# Calculate differences between the predicted and true values
loaded_predictions_NIR.loc[:, "Diff Temp (%)"] = (loaded_predictions_NIR.loc[:, "Pred. Temperature"]\
                    -loaded_predictions_NIR.loc[:, "True Temperature"])/loaded_predictions_NIR.loc[:, "Pred. Temperature"]

loaded_predictions_NIR.loc[:, "Diff Log(g) (%)"] = (loaded_predictions_NIR.loc[:, "Pred Log(g)"]\
                    -loaded_predictions_NIR.loc[:, "True Log(g)"])/loaded_predictions_NIR.loc[:, "Pred Log(g)"]

loaded_predictions_NIR.loc[:, "Diff Met (%)"] = (loaded_predictions_NIR.loc[:, "Pred Metallicity"]\
                    -loaded_predictions_NIR.loc[:, "True Metallicity"])/loaded_predictions_NIR.loc[:, "Pred Metallicity"]
In [ ]:
diff_log_NIR = copy.deepcopy(loaded_predictions_NIR)
# Keep only the data within +2 to -2 of standard deviations.

diff_log_NIR = diff_log_NIR[(np.abs(diff_log_NIR["Diff Temp (%)"]-diff_log_NIR["Diff Temp (%)"].mean())\
                      <= (2*diff_log_NIR["Diff Temp (%)"].std()))]

diff_log_NIR = diff_log_NIR[(np.abs(diff_log_NIR["Diff Log(g) (%)"]-diff_log_NIR["Diff Log(g) (%)"].mean())\
                      <= (2*diff_log_NIR["Diff Log(g) (%)"].std()))]

diff_log_NIR = diff_log_NIR[(np.abs(diff_log_NIR["Diff Met (%)"]-diff_log_NIR["Diff Met (%)"].mean())\
                      <= (2*diff_log_NIR["Diff Met (%)"].std()))]
display(diff_log_NIR.head(20))
In [ ]:
print("NIR")
print("=================")
print()

# Print the Mean error
print("MSE for Temperature:  {0:2.6f}".format(np.power(diff_log_NIR["Diff Temp (%)"].mean(), 2)))
print("MSE for Gravity:      {0:2.6f}".format(np.power(diff_log_NIR["Diff Log(g) (%)"].mean(), 2)))
print("MSE for Metallicity:  {0:2.6f}".format(np.power(diff_log_NIR["Diff Met (%)"].mean(), 2)))
print()

# Print the median error
print("Median error for Temperature:  {0:2.6f}".format(np.abs(diff_log_NIR["Diff Temp (%)"].median())))
print("Median error for Gravity:      {0:2.6f}".format(np.abs(diff_log_NIR["Diff Log(g) (%)"].median())))
print("Median error for Metallicity:  {0:2.6f}".format(np.abs(diff_log_NIR["Diff Met (%)"].median())))
In [ ]:
# Clear the session as a preperation for the next one. 
backend.clear_session()

Pre-training steps: VIS



Divide into training and validation sets

In [19]:
# Load and shuffle the paths randomly
matrix_paths_VIS = get_MATRIX_paths('VIS')
random.Random(32).shuffle(matrix_paths_VIS)                                    # Shuffle the same way each time (use 32)

training_paths_VIS = matrix_paths_VIS[:int(len(matrix_paths_VIS)*0.8)]         # train on 80% of the data set
validation_paths_VIS = matrix_paths_VIS[int(len(matrix_paths_VIS)*0.8):int(len(matrix_paths_VIS)*0.9)]  #validate on 10%
test_paths_VIS = matrix_paths_VIS[int(len(matrix_paths_VIS)*0.9):]             # test on 10%

print("Number of training samples for NIR:   ", len(training_paths_VIS))
print("Number of validation samples for NIR: ", len(validation_paths_VIS))
print("Number of test samples for NIR:       ", len(test_paths_VIS))
Number of training samples for NIR:    816
Number of validation samples for NIR:  102
Number of test samples for NIR:        102
In [20]:
## Copy the test files to another folder
#folderPath = "/home/salomonsson/Master Project/temporary/VIS/" #to get the path of the folder
#for i in range(60):
#    shutil.copy(test_paths_VIS[i], folderPath)

Create the VIS model

In [21]:
7747//down_size
Out[21]:
3873
In [22]:
# Down size the matrix this much. down_size = 1 equals no scaling. However, this doesnt fit into the GPU memory!
down_size = 3      # 2 doesnt work for GPU memory reasons
In [23]:
VIS_model = CNN_model_FULL(input_shape=(7747//down_size, 2871//down_size, 3))

# Divide the input shape by 8 and remove the last three layers (1024, 2048, 4096) in the CNN_model results
# in very few trainable parameters (8M, instead of 203). Try it!?
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d_9 (Conv2D)            (None, 2576, 951, 16)     2368      
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 1288, 475, 16)     0         
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 1282, 469, 32)     25120     
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 641, 234, 32)      0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 635, 228, 64)      100416    
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 317, 114, 64)      0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 313, 110, 128)     204928    
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 156, 55, 128)      0         
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 152, 51, 256)      819456    
_________________________________________________________________
max_pooling2d_13 (MaxPooling (None, 76, 25, 256)       0         
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 72, 21, 512)       3277312   
_________________________________________________________________
max_pooling2d_14 (MaxPooling (None, 36, 10, 512)       0         
_________________________________________________________________
conv2d_15 (Conv2D)           (None, 34, 8, 1024)       4719616   
_________________________________________________________________
max_pooling2d_15 (MaxPooling (None, 17, 4, 1024)       0         
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 15, 2, 1024)       9438208   
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 7, 1, 1024)        0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 7168)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 50)                358450    
_________________________________________________________________
dense_5 (Dense)              (None, 20)                1020      
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 63        
=================================================================
Total params: 18,946,957
Trainable params: 18,946,957
Non-trainable params: 0
_________________________________________________________________

Create a generator for the training set and validation set respectively. Compile the VIS model

In [24]:
batch_size = 2     # (When running with larger batch size on GPU, a ResourceExhaustedError is thrown.) 

my_training_batch_generator_VIS = Data_Generator(training_paths_VIS, batch_size)        # training generator
my_validation_batch_generator_VIS = Data_Generator(validation_paths_VIS, batch_size)    # validation generator
my_test_batch_generator_VIS = Data_Generator(test_paths_VIS, batch_size)                # testing generator


VIS_model.compile(optimizer=optimizers.Adam(lr=0.0001), loss='mean_squared_error', metrics=['mse'])
# Important to note: for regression cases, one needs to use mse or mae as loss function. 

Train the VIS model

Before that, create a checkpointer to store the best model.


When using the GPU for CNN training between different Jupyter notebooks, cleaning the GPU memory between the runs might be necessary. Do so by first calling sudo fuser -v /dev/nvidia* in a Terminal window before executing sudo kill -xxxxx PID, where xxxxx is the process' PID number.


In [85]:
def fxn():
    """ Define a function to ignore a UserWarning related to precision loss when converting from int64 to float64."""
    warnings.warn("Possible precision loss when converting from int64 to float64", UserWarning)


if "Darwin" in os.uname():
    model_path = "/Users/jakob/Desktop/Master Project/Models/VIS_model.weights.best.hdf5"
    log_path = "/Users/jakob/Desktop/Master Project/Models/VIS_training.log"
else:
    model_path = "/home/salomonsson/Master Project/Models/VIS_model.weights.best.hdf5"
    log_path = "/home/salomonsson/Master Project/Models/VIS_150epochs_Matrix_DivOn3_batch2_Sequence.log"
In [16]:
# Load the model to continue the training
#VIS_model.load_weights(model_path)
In [17]:
start = time.time()   # Time it
num_epochs = 100

with warnings.catch_warnings():                                                        # Catch and ignore a warning
    warnings.simplefilter("ignore")
    fxn() 
    csv_logger = CSVLogger(log_path, separator=',', append=False)   # Log the training
    checkpointer = ModelCheckpoint(filepath=model_path, 
                                   verbose=1, 
                                   save_best_only=True)                                # Save the best model only
    architecture = TensorBoard(log_dir='./Graph/VIS/', histogram_freq=0, 
                               write_graph=True, write_images=True)                    # Save the CNN Architecture
    
    history = VIS_model.fit_generator(generator=my_training_batch_generator_VIS,
                                    steps_per_epoch=(len(validation_paths_VIS) // batch_size),
                                    epochs=num_epochs,
                                    verbose=1,
                                    callbacks=[checkpointer, csv_logger, architecture],
                                    validation_data=my_validation_batch_generator_VIS, 
                                    validation_steps=(len(validation_paths_VIS) // batch_size), 
                                    use_multiprocessing=True, 
                                    max_queue_size=2,
                                    workers=10,
                                    shuffle=True)#,
                                    #initial_epoch=6)             # start training from here

# Print total training time
end = time.time()
tt = end-start
print()
print("Total training time was {0:0.0f} min and {1:0.0f}s. ".format((tt-tt%60)/60, tt%60))
Epoch 1/100
51/51 [==============================] - 217s 4s/step - loss: 0.0375 - mean_squared_error: 0.0375 - val_loss: 0.0044 - val_mean_squared_error: 0.0044

Epoch 00001: val_loss improved from inf to 0.00443, saving model to /home/salomonsson/Master Project/Models/VIS_model.weights.best.hdf5
Epoch 2/100
51/51 [==============================] - 182s 4s/step - loss: 0.0014 - mean_squared_error: 0.0014 - val_loss: 0.0032 - val_mean_squared_error: 0.0032

Epoch 00002: val_loss improved from 0.00443 to 0.00316, saving model to /home/salomonsson/Master Project/Models/VIS_model.weights.best.hdf5
Epoch 3/100
51/51 [==============================] - 187s 4s/step - loss: 0.0018 - mean_squared_error: 0.0018 - val_loss: 0.0033 - val_mean_squared_error: 0.0033

Epoch 00003: val_loss did not improve from 0.00316
Epoch 4/100
51/51 [==============================] - 178s 3s/step - loss: 0.0033 - mean_squared_error: 0.0033 - val_loss: 0.0027 - val_mean_squared_error: 0.0027

Epoch 00004: val_loss improved from 0.00316 to 0.00267, saving model to /home/salomonsson/Master Project/Models/VIS_model.weights.best.hdf5
Epoch 5/100
51/51 [==============================] - 191s 4s/step - loss: 0.0018 - mean_squared_error: 0.0018 - val_loss: 0.0029 - val_mean_squared_error: 0.0029

Epoch 00005: val_loss did not improve from 0.00267
Epoch 6/100
51/51 [==============================] - 171s 3s/step - loss: 0.0050 - mean_squared_error: 0.0050 - val_loss: 0.0047 - val_mean_squared_error: 0.0047

Epoch 00006: val_loss did not improve from 0.00267
Epoch 7/100
51/51 [==============================] - 184s 4s/step - loss: 0.0035 - mean_squared_error: 0.0035 - val_loss: 0.0013 - val_mean_squared_error: 0.0013

Epoch 00007: val_loss improved from 0.00267 to 0.00127, saving model to /home/salomonsson/Master Project/Models/VIS_model.weights.best.hdf5
Epoch 8/100
51/51 [==============================] - 186s 4s/step - loss: 0.0025 - mean_squared_error: 0.0025 - val_loss: 0.0026 - val_mean_squared_error: 0.0026

Epoch 00008: val_loss did not improve from 0.00127
Epoch 9/100
51/51 [==============================] - 178s 3s/step - loss: 0.0025 - mean_squared_error: 0.0025 - val_loss: 0.0079 - val_mean_squared_error: 0.0079

Epoch 00009: val_loss did not improve from 0.00127
Epoch 10/100
51/51 [==============================] - 201s 4s/step - loss: 0.0028 - mean_squared_error: 0.0028 - val_loss: 0.0036 - val_mean_squared_error: 0.0036

Epoch 00010: val_loss did not improve from 0.00127
Epoch 11/100
51/51 [==============================] - 173s 3s/step - loss: 0.0018 - mean_squared_error: 0.0018 - val_loss: 0.0061 - val_mean_squared_error: 0.0061

Epoch 00011: val_loss did not improve from 0.00127
Epoch 12/100
51/51 [==============================] - 182s 4s/step - loss: 0.0021 - mean_squared_error: 0.0021 - val_loss: 0.0017 - val_mean_squared_error: 0.0017

Epoch 00012: val_loss did not improve from 0.00127
Epoch 13/100
51/51 [==============================] - 179s 4s/step - loss: 0.0013 - mean_squared_error: 0.0013 - val_loss: 0.0032 - val_mean_squared_error: 0.0032

Epoch 00013: val_loss did not improve from 0.00127
Epoch 14/100
51/51 [==============================] - 182s 4s/step - loss: 0.0031 - mean_squared_error: 0.0031 - val_loss: 0.0022 - val_mean_squared_error: 0.0022

Epoch 00014: val_loss did not improve from 0.00127
Epoch 15/100
51/51 [==============================] - 167s 3s/step - loss: 0.0012 - mean_squared_error: 0.0012 - val_loss: 0.0046 - val_mean_squared_error: 0.0046

Epoch 00015: val_loss did not improve from 0.00127
Epoch 16/100
51/51 [==============================] - 176s 3s/step - loss: 0.0019 - mean_squared_error: 0.0019 - val_loss: 0.0048 - val_mean_squared_error: 0.0048

Epoch 00016: val_loss did not improve from 0.00127
Epoch 17/100
51/51 [==============================] - 201s 4s/step - loss: 0.0016 - mean_squared_error: 0.0016 - val_loss: 0.0023 - val_mean_squared_error: 0.0023

Epoch 00017: val_loss did not improve from 0.00127
Epoch 18/100
51/51 [==============================] - 228s 4s/step - loss: 0.0030 - mean_squared_error: 0.0030 - val_loss: 0.0035 - val_mean_squared_error: 0.0035

Epoch 00018: val_loss did not improve from 0.00127
Epoch 19/100
51/51 [==============================] - 169s 3s/step - loss: 0.0033 - mean_squared_error: 0.0033 - val_loss: 0.0015 - val_mean_squared_error: 0.0015

Epoch 00019: val_loss did not improve from 0.00127
Epoch 20/100
51/51 [==============================] - 211s 4s/step - loss: 0.0016 - mean_squared_error: 0.0016 - val_loss: 0.0030 - val_mean_squared_error: 0.0030

Epoch 00020: val_loss did not improve from 0.00127
Epoch 21/100
51/51 [==============================] - 206s 4s/step - loss: 5.0496e-04 - mean_squared_error: 5.0496e-04 - val_loss: 0.0027 - val_mean_squared_error: 0.0027

Epoch 00021: val_loss did not improve from 0.00127
Epoch 22/100
51/51 [==============================] - 198s 4s/step - loss: 0.0020 - mean_squared_error: 0.0020 - val_loss: 0.0017 - val_mean_squared_error: 0.0017

Epoch 00022: val_loss did not improve from 0.00127
Epoch 23/100
51/51 [==============================] - 186s 4s/step - loss: 0.0017 - mean_squared_error: 0.0017 - val_loss: 0.0034 - val_mean_squared_error: 0.0034

Epoch 00023: val_loss did not improve from 0.00127
Epoch 24/100
51/51 [==============================] - 197s 4s/step - loss: 0.0025 - mean_squared_error: 0.0025 - val_loss: 0.0032 - val_mean_squared_error: 0.0032

Epoch 00024: val_loss did not improve from 0.00127
Epoch 25/100
51/51 [==============================] - 198s 4s/step - loss: 0.0014 - mean_squared_error: 0.0014 - val_loss: 0.0027 - val_mean_squared_error: 0.0027

Epoch 00025: val_loss did not improve from 0.00127
Epoch 26/100
51/51 [==============================] - 180s 4s/step - loss: 0.0023 - mean_squared_error: 0.0023 - val_loss: 0.0050 - val_mean_squared_error: 0.0050

Epoch 00026: val_loss did not improve from 0.00127
Epoch 27/100
51/51 [==============================] - 172s 3s/step - loss: 0.0026 - mean_squared_error: 0.0026 - val_loss: 0.0045 - val_mean_squared_error: 0.0045

Epoch 00027: val_loss did not improve from 0.00127
Epoch 28/100
51/51 [==============================] - 183s 4s/step - loss: 0.0021 - mean_squared_error: 0.0021 - val_loss: 0.0028 - val_mean_squared_error: 0.0028

Epoch 00028: val_loss did not improve from 0.00127
Epoch 29/100
51/51 [==============================] - 190s 4s/step - loss: 9.5954e-04 - mean_squared_error: 9.5954e-04 - val_loss: 0.0032 - val_mean_squared_error: 0.0032

Epoch 00029: val_loss did not improve from 0.00127
Epoch 30/100
51/51 [==============================] - 192s 4s/step - loss: 0.0037 - mean_squared_error: 0.0037 - val_loss: 0.0051 - val_mean_squared_error: 0.0051

Epoch 00030: val_loss did not improve from 0.00127
Epoch 31/100
51/51 [==============================] - 185s 4s/step - loss: 0.0014 - mean_squared_error: 0.0014 - val_loss: 0.0028 - val_mean_squared_error: 0.0028

Epoch 00031: val_loss did not improve from 0.00127
Epoch 32/100
51/51 [==============================] - 179s 4s/step - loss: 9.2768e-04 - mean_squared_error: 9.2768e-04 - val_loss: 0.0025 - val_mean_squared_error: 0.0025

Epoch 00032: val_loss did not improve from 0.00127
Epoch 33/100
51/51 [==============================] - 171s 3s/step - loss: 0.0015 - mean_squared_error: 0.0015 - val_loss: 0.0025 - val_mean_squared_error: 0.0025

Epoch 00033: val_loss did not improve from 0.00127
Epoch 34/100
51/51 [==============================] - 175s 3s/step - loss: 0.0021 - mean_squared_error: 0.0021 - val_loss: 0.0028 - val_mean_squared_error: 0.0028

Epoch 00034: val_loss did not improve from 0.00127
Epoch 35/100
51/51 [==============================] - 170s 3s/step - loss: 0.0035 - mean_squared_error: 0.0035 - val_loss: 0.0048 - val_mean_squared_error: 0.0048

Epoch 00035: val_loss did not improve from 0.00127
Epoch 36/100
51/51 [==============================] - 185s 4s/step - loss: 0.0016 - mean_squared_error: 0.0016 - val_loss: 0.0032 - val_mean_squared_error: 0.0032

Epoch 00036: val_loss did not improve from 0.00127
Epoch 37/100
51/51 [==============================] - 177s 3s/step - loss: 5.9038e-04 - mean_squared_error: 5.9038e-04 - val_loss: 0.0065 - val_mean_squared_error: 0.0065

Epoch 00037: val_loss did not improve from 0.00127
Epoch 38/100
51/51 [==============================] - 184s 4s/step - loss: 0.0020 - mean_squared_error: 0.0020 - val_loss: 0.0025 - val_mean_squared_error: 0.0025

Epoch 00038: val_loss did not improve from 0.00127
Epoch 39/100
51/51 [==============================] - 190s 4s/step - loss: 0.0026 - mean_squared_error: 0.0026 - val_loss: 0.0019 - val_mean_squared_error: 0.0019

Epoch 00039: val_loss did not improve from 0.00127
Epoch 40/100
51/51 [==============================] - 176s 3s/step - loss: 0.0010 - mean_squared_error: 0.0010 - val_loss: 0.0027 - val_mean_squared_error: 0.0027

Epoch 00040: val_loss did not improve from 0.00127
Epoch 41/100
51/51 [==============================] - 191s 4s/step - loss: 0.0018 - mean_squared_error: 0.0018 - val_loss: 0.0023 - val_mean_squared_error: 0.0023

Epoch 00041: val_loss did not improve from 0.00127
Epoch 42/100
51/51 [==============================] - 175s 3s/step - loss: 0.0022 - mean_squared_error: 0.0022 - val_loss: 0.0020 - val_mean_squared_error: 0.0020

Epoch 00042: val_loss did not improve from 0.00127
Epoch 43/100
51/51 [==============================] - 187s 4s/step - loss: 0.0015 - mean_squared_error: 0.0015 - val_loss: 0.0022 - val_mean_squared_error: 0.0022

Epoch 00043: val_loss did not improve from 0.00127
Epoch 44/100
51/51 [==============================] - 185s 4s/step - loss: 0.0012 - mean_squared_error: 0.0012 - val_loss: 0.0021 - val_mean_squared_error: 0.0021

Epoch 00044: val_loss did not improve from 0.00127
Epoch 45/100
51/51 [==============================] - 178s 3s/step - loss: 0.0016 - mean_squared_error: 0.0016 - val_loss: 0.0023 - val_mean_squared_error: 0.0023

Epoch 00045: val_loss did not improve from 0.00127
Epoch 46/100
51/51 [==============================] - 197s 4s/step - loss: 0.0013 - mean_squared_error: 0.0013 - val_loss: 0.0031 - val_mean_squared_error: 0.0031

Epoch 00046: val_loss did not improve from 0.00127
Epoch 47/100
51/51 [==============================] - 188s 4s/step - loss: 7.4010e-04 - mean_squared_error: 7.4010e-04 - val_loss: 7.5330e-04 - val_mean_squared_error: 7.5330e-04

Epoch 00047: val_loss improved from 0.00127 to 0.00075, saving model to /home/salomonsson/Master Project/Models/VIS_model.weights.best.hdf5
Epoch 48/100
51/51 [==============================] - 182s 4s/step - loss: 0.0011 - mean_squared_error: 0.0011 - val_loss: 0.0018 - val_mean_squared_error: 0.0018

Epoch 00048: val_loss did not improve from 0.00075
Epoch 49/100
51/51 [==============================] - 175s 3s/step - loss: 0.0010 - mean_squared_error: 0.0010 - val_loss: 0.0016 - val_mean_squared_error: 0.0016

Epoch 00049: val_loss did not improve from 0.00075
Epoch 50/100
51/51 [==============================] - 206s 4s/step - loss: 9.4849e-04 - mean_squared_error: 9.4849e-04 - val_loss: 0.0022 - val_mean_squared_error: 0.0022

Epoch 00050: val_loss did not improve from 0.00075
Epoch 51/100
51/51 [==============================] - 186s 4s/step - loss: 0.0012 - mean_squared_error: 0.0012 - val_loss: 0.0033 - val_mean_squared_error: 0.0033

Epoch 00051: val_loss did not improve from 0.00075
Epoch 52/100
51/51 [==============================] - 191s 4s/step - loss: 7.7756e-04 - mean_squared_error: 7.7756e-04 - val_loss: 0.0020 - val_mean_squared_error: 0.0020

Epoch 00052: val_loss did not improve from 0.00075
Epoch 53/100
51/51 [==============================] - 150s 3s/step - loss: 9.8262e-04 - mean_squared_error: 9.8262e-04 - val_loss: 8.5962e-04 - val_mean_squared_error: 8.5962e-04

Epoch 00053: val_loss did not improve from 0.00075
Epoch 54/100
51/51 [==============================] - 114s 2s/step - loss: 0.0011 - mean_squared_error: 0.0011 - val_loss: 0.0023 - val_mean_squared_error: 0.0023

Epoch 00054: val_loss did not improve from 0.00075
Epoch 55/100
51/51 [==============================] - 115s 2s/step - loss: 6.0990e-04 - mean_squared_error: 6.0990e-04 - val_loss: 0.0020 - val_mean_squared_error: 0.0020

Epoch 00055: val_loss did not improve from 0.00075
Epoch 56/100
51/51 [==============================] - 116s 2s/step - loss: 0.0014 - mean_squared_error: 0.0014 - val_loss: 0.0017 - val_mean_squared_error: 0.0017

Epoch 00056: val_loss did not improve from 0.00075
Epoch 57/100
51/51 [==============================] - 116s 2s/step - loss: 9.7211e-04 - mean_squared_error: 9.7211e-04 - val_loss: 0.0011 - val_mean_squared_error: 0.0011

Epoch 00057: val_loss did not improve from 0.00075
Epoch 58/100
51/51 [==============================] - 116s 2s/step - loss: 6.6581e-04 - mean_squared_error: 6.6581e-04 - val_loss: 0.0016 - val_mean_squared_error: 0.0016

Epoch 00058: val_loss did not improve from 0.00075
Epoch 59/100
51/51 [==============================] - 116s 2s/step - loss: 3.3597e-04 - mean_squared_error: 3.3597e-04 - val_loss: 6.6672e-04 - val_mean_squared_error: 6.6672e-04

Epoch 00059: val_loss improved from 0.00075 to 0.00067, saving model to /home/salomonsson/Master Project/Models/VIS_model.weights.best.hdf5
Epoch 60/100
51/51 [==============================] - 121s 2s/step - loss: 4.2309e-04 - mean_squared_error: 4.2309e-04 - val_loss: 0.0017 - val_mean_squared_error: 0.0017

Epoch 00060: val_loss did not improve from 0.00067
Epoch 61/100
51/51 [==============================] - 124s 2s/step - loss: 0.0011 - mean_squared_error: 0.0011 - val_loss: 0.0014 - val_mean_squared_error: 0.0014

Epoch 00061: val_loss did not improve from 0.00067
Epoch 62/100
51/51 [==============================] - 116s 2s/step - loss: 0.0011 - mean_squared_error: 0.0011 - val_loss: 0.0014 - val_mean_squared_error: 0.0014

Epoch 00062: val_loss did not improve from 0.00067
Epoch 63/100
51/51 [==============================] - 114s 2s/step - loss: 2.6897e-04 - mean_squared_error: 2.6897e-04 - val_loss: 0.0010 - val_mean_squared_error: 0.0010

Epoch 00063: val_loss did not improve from 0.00067
Epoch 64/100
51/51 [==============================] - 115s 2s/step - loss: 4.6159e-04 - mean_squared_error: 4.6159e-04 - val_loss: 8.4150e-04 - val_mean_squared_error: 8.4150e-04

Epoch 00064: val_loss did not improve from 0.00067
Epoch 65/100
51/51 [==============================] - 116s 2s/step - loss: 6.2465e-04 - mean_squared_error: 6.2465e-04 - val_loss: 0.0010 - val_mean_squared_error: 0.0010

Epoch 00065: val_loss did not improve from 0.00067
Epoch 66/100
51/51 [==============================] - 116s 2s/step - loss: 0.0010 - mean_squared_error: 0.0010 - val_loss: 8.8084e-04 - val_mean_squared_error: 8.8084e-04

Epoch 00066: val_loss did not improve from 0.00067
Epoch 67/100
51/51 [==============================] - 115s 2s/step - loss: 3.9217e-04 - mean_squared_error: 3.9217e-04 - val_loss: 5.1768e-04 - val_mean_squared_error: 5.1768e-04

Epoch 00067: val_loss improved from 0.00067 to 0.00052, saving model to /home/salomonsson/Master Project/Models/VIS_model.weights.best.hdf5
Epoch 68/100
51/51 [==============================] - 116s 2s/step - loss: 8.1115e-04 - mean_squared_error: 8.1115e-04 - val_loss: 0.0011 - val_mean_squared_error: 0.0011

Epoch 00068: val_loss did not improve from 0.00052
Epoch 69/100
51/51 [==============================] - 115s 2s/step - loss: 5.8176e-04 - mean_squared_error: 5.8176e-04 - val_loss: 0.0019 - val_mean_squared_error: 0.0019

Epoch 00069: val_loss did not improve from 0.00052
Epoch 70/100
51/51 [==============================] - 124s 2s/step - loss: 4.5854e-04 - mean_squared_error: 4.5854e-04 - val_loss: 0.0010 - val_mean_squared_error: 0.0010

Epoch 00070: val_loss did not improve from 0.00052
Epoch 71/100
51/51 [==============================] - 130s 3s/step - loss: 7.2133e-04 - mean_squared_error: 7.2133e-04 - val_loss: 0.0012 - val_mean_squared_error: 0.0012

Epoch 00071: val_loss did not improve from 0.00052
Epoch 72/100
51/51 [==============================] - 117s 2s/step - loss: 5.8423e-04 - mean_squared_error: 5.8423e-04 - val_loss: 0.0010 - val_mean_squared_error: 0.0010

Epoch 00072: val_loss did not improve from 0.00052
Epoch 73/100
51/51 [==============================] - 117s 2s/step - loss: 4.9557e-04 - mean_squared_error: 4.9557e-04 - val_loss: 6.8064e-04 - val_mean_squared_error: 6.8064e-04

Epoch 00073: val_loss did not improve from 0.00052
Epoch 74/100
51/51 [==============================] - 114s 2s/step - loss: 3.9693e-04 - mean_squared_error: 3.9693e-04 - val_loss: 0.0023 - val_mean_squared_error: 0.0023

Epoch 00074: val_loss did not improve from 0.00052
Epoch 75/100
51/51 [==============================] - 116s 2s/step - loss: 4.4147e-04 - mean_squared_error: 4.4147e-04 - val_loss: 9.1743e-04 - val_mean_squared_error: 9.1743e-04

Epoch 00075: val_loss did not improve from 0.00052
Epoch 76/100
51/51 [==============================] - 116s 2s/step - loss: 1.5287e-04 - mean_squared_error: 1.5287e-04 - val_loss: 0.0011 - val_mean_squared_error: 0.0011

Epoch 00076: val_loss did not improve from 0.00052
Epoch 77/100
51/51 [==============================] - 117s 2s/step - loss: 3.0437e-04 - mean_squared_error: 3.0437e-04 - val_loss: 4.4886e-04 - val_mean_squared_error: 4.4886e-04

Epoch 00077: val_loss improved from 0.00052 to 0.00045, saving model to /home/salomonsson/Master Project/Models/VIS_model.weights.best.hdf5
Epoch 78/100
51/51 [==============================] - 115s 2s/step - loss: 3.7199e-04 - mean_squared_error: 3.7199e-04 - val_loss: 0.0012 - val_mean_squared_error: 0.0012

Epoch 00078: val_loss did not improve from 0.00045
Epoch 79/100
51/51 [==============================] - 116s 2s/step - loss: 9.8590e-04 - mean_squared_error: 9.8590e-04 - val_loss: 8.3051e-04 - val_mean_squared_error: 8.3051e-04

Epoch 00079: val_loss did not improve from 0.00045
Epoch 80/100
51/51 [==============================] - 122s 2s/step - loss: 3.6600e-04 - mean_squared_error: 3.6600e-04 - val_loss: 7.1873e-04 - val_mean_squared_error: 7.1873e-04

Epoch 00080: val_loss did not improve from 0.00045
Epoch 81/100
51/51 [==============================] - 127s 2s/step - loss: 0.0011 - mean_squared_error: 0.0011 - val_loss: 0.0026 - val_mean_squared_error: 0.0026

Epoch 00081: val_loss did not improve from 0.00045
Epoch 82/100
51/51 [==============================] - 116s 2s/step - loss: 5.7984e-04 - mean_squared_error: 5.7984e-04 - val_loss: 4.4006e-04 - val_mean_squared_error: 4.4006e-04

Epoch 00082: val_loss improved from 0.00045 to 0.00044, saving model to /home/salomonsson/Master Project/Models/VIS_model.weights.best.hdf5
Epoch 83/100
51/51 [==============================] - 114s 2s/step - loss: 5.6234e-04 - mean_squared_error: 5.6234e-04 - val_loss: 0.0013 - val_mean_squared_error: 0.0013

Epoch 00083: val_loss did not improve from 0.00044
Epoch 84/100
51/51 [==============================] - 116s 2s/step - loss: 0.0010 - mean_squared_error: 0.0010 - val_loss: 9.3076e-04 - val_mean_squared_error: 9.3076e-04

Epoch 00084: val_loss did not improve from 0.00044
Epoch 85/100
51/51 [==============================] - 115s 2s/step - loss: 7.1237e-04 - mean_squared_error: 7.1237e-04 - val_loss: 0.0011 - val_mean_squared_error: 0.0011

Epoch 00085: val_loss did not improve from 0.00044
Epoch 86/100
51/51 [==============================] - 114s 2s/step - loss: 5.0589e-04 - mean_squared_error: 5.0589e-04 - val_loss: 8.1715e-04 - val_mean_squared_error: 8.1715e-04

Epoch 00086: val_loss did not improve from 0.00044
Epoch 87/100
51/51 [==============================] - 118s 2s/step - loss: 8.9290e-04 - mean_squared_error: 8.9290e-04 - val_loss: 0.0010 - val_mean_squared_error: 0.0010

Epoch 00087: val_loss did not improve from 0.00044
Epoch 88/100
51/51 [==============================] - 116s 2s/step - loss: 4.0955e-04 - mean_squared_error: 4.0955e-04 - val_loss: 8.9419e-04 - val_mean_squared_error: 8.9419e-04

Epoch 00088: val_loss did not improve from 0.00044
Epoch 89/100
51/51 [==============================] - 114s 2s/step - loss: 3.5499e-04 - mean_squared_error: 3.5499e-04 - val_loss: 3.4969e-04 - val_mean_squared_error: 3.4969e-04

Epoch 00089: val_loss improved from 0.00044 to 0.00035, saving model to /home/salomonsson/Master Project/Models/VIS_model.weights.best.hdf5
Epoch 90/100
51/51 [==============================] - 123s 2s/step - loss: 4.8108e-04 - mean_squared_error: 4.8108e-04 - val_loss: 0.0014 - val_mean_squared_error: 0.0014

Epoch 00090: val_loss did not improve from 0.00035
Epoch 91/100
51/51 [==============================] - 128s 3s/step - loss: 3.1849e-04 - mean_squared_error: 3.1849e-04 - val_loss: 9.9469e-04 - val_mean_squared_error: 9.9469e-04

Epoch 00091: val_loss did not improve from 0.00035
Epoch 92/100
51/51 [==============================] - 116s 2s/step - loss: 4.4696e-04 - mean_squared_error: 4.4696e-04 - val_loss: 0.0015 - val_mean_squared_error: 0.0015

Epoch 00092: val_loss did not improve from 0.00035
Epoch 93/100
51/51 [==============================] - 117s 2s/step - loss: 5.7423e-04 - mean_squared_error: 5.7423e-04 - val_loss: 0.0011 - val_mean_squared_error: 0.0011

Epoch 00093: val_loss did not improve from 0.00035
Epoch 94/100
51/51 [==============================] - 116s 2s/step - loss: 9.9977e-04 - mean_squared_error: 9.9977e-04 - val_loss: 0.0013 - val_mean_squared_error: 0.0013

Epoch 00094: val_loss did not improve from 0.00035
Epoch 95/100
51/51 [==============================] - 117s 2s/step - loss: 5.4168e-04 - mean_squared_error: 5.4168e-04 - val_loss: 9.5911e-04 - val_mean_squared_error: 9.5911e-04

Epoch 00095: val_loss did not improve from 0.00035
Epoch 96/100
51/51 [==============================] - 114s 2s/step - loss: 7.4066e-04 - mean_squared_error: 7.4066e-04 - val_loss: 7.4059e-04 - val_mean_squared_error: 7.4059e-04

Epoch 00096: val_loss did not improve from 0.00035
Epoch 97/100
51/51 [==============================] - 115s 2s/step - loss: 2.4963e-04 - mean_squared_error: 2.4963e-04 - val_loss: 3.5804e-04 - val_mean_squared_error: 3.5804e-04

Epoch 00097: val_loss did not improve from 0.00035
Epoch 98/100
51/51 [==============================] - 115s 2s/step - loss: 7.6301e-04 - mean_squared_error: 7.6301e-04 - val_loss: 5.6466e-04 - val_mean_squared_error: 5.6466e-04

Epoch 00098: val_loss did not improve from 0.00035
Epoch 99/100
51/51 [==============================] - 117s 2s/step - loss: 5.9426e-04 - mean_squared_error: 5.9426e-04 - val_loss: 6.3673e-04 - val_mean_squared_error: 6.3673e-04

Epoch 00099: val_loss did not improve from 0.00035
Epoch 100/100
51/51 [==============================] - 121s 2s/step - loss: 3.6254e-04 - mean_squared_error: 3.6254e-04 - val_loss: 4.6187e-04 - val_mean_squared_error: 4.6187e-04

Epoch 00100: val_loss did not improve from 0.00035

Total training time was 255 min and 48s. 
In [ ]:
# tensorboard --logdir=/home/salomonsson/Master\ Project/Graph/

Plot Error and Loss per Epoch for VIS

In [86]:
# (MSE and Loss is the same)
# Load the training history from file
log_data = pd.read_csv(log_path, sep=',', engine='python')
print(log_data.columns)

log_data = log_data.iloc[1:, :]

# Plot for the Error
plt.figure(figsize=(14,7))
plt.plot(log_data['loss'], color="black")
plt.plot(log_data['val_loss'], color="red")
plt.title("Mean Squared Error (VIS)", fontsize=20)
plt.ylabel("MSE", fontsize=17)
plt.xlabel("Epochs", fontsize=17)
plt.legend(["Training error:    " + str(log_data.iloc[-1,-3])[:8], 
            "Validation error: " + str(log_data.iloc[-1,-1])[:8]], 
           loc="upper right", fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.autoscale(enable=True, axis="x", tight=True)
plt.show()
Index(['epoch', 'loss', 'mean_squared_error', 'val_loss',
       'val_mean_squared_error'],
      dtype='object')

Display the Architecture in TensorBoard

In [ ]:
architecture.set_model(VIS_model)

# Then run "tensorboard --logdir .Graph/VIS/" in Terminal

Load the VIS Model with Least Validation Loss

In [80]:
if "Darwin" in os.uname():
    model_path = "/Users/jakob/Desktop/Master Project/Models/VIS_model.weights.best.hdf5"
    pred_path = "/Users/jakob/Desktop/Master Project/Predictions/Some_preds_VIS.txt"
else:
    model_path = "/home/salomonsson/Master Project/Models/VIS_model.weights.best.hdf5"
    pred_path = "/home/salomonsson/Master Project/Predictions/VIS_150epochs_Matrix_DivOn3_batch2_Sequence.txt"
    
#VIS_model.load_weights(model_path)

Make predictions for VIS

In [20]:
np.set_printoptions(precision=3)
list_for_saving = []

print("Parameters are printed as [Temperature, Log(G), Metallicity]")
print("---------------------------------------------------------------------------------------------")
for path in range(50):
    # Load the matrix, parameters and the name. Resize the matrix
    mat, param, name = get_Matrix_and_Parameters(test_paths_VIS[path])  
    mat = skimage.transform.resize(mat, (mat.shape[0]//down_size, mat.shape[1]//down_size, 3), 
                                           mode='constant', preserve_range=True)
    
    # Reshape the matrix to "[batch, 3D RGB image]"
    mat = np.reshape(mat, (1, mat.shape[0], mat.shape[1], mat.shape[2]))
    
    # Rescale the matrix to [0,1]
    mat = normalise(mat)                                                  
    
    # Fit the scaler on the parameters
    scaled_param = MMscale_param(param, name)
    
    # Use the model to predict the parameters.
    predicted_parameters = VIS_model.predict(mat)
    
    # Use the fitted scaler to unscale the parameters to their original interval
    unscaled_pred_param = Un_scale_data(predicted_parameters, name)
    
    # Calculate the difference in percent and store the values in a list
    diff = (abs(unscaled_pred_param.reshape(1,-1)-param))/abs(unscaled_pred_param.reshape(1,-1))
    
    # Prepare for saving to file
    d1 = pd.DataFrame(unscaled_pred_param.reshape(1,-1))
    d2 = pd.DataFrame(param.reshape(1, -1))
    d = pd.concat([d1, d2], axis=1)
    list_for_saving.append(d)
    
    # Print the results (transpose the unscaled predicted parameters for readability).
    print("Predicted: {0}, Real values: {1}, DIFF: {2}".format(np.transpose(unscaled_pred_param)[0], param, diff))
    print()


# Save to file
d2 = pd.concat(list_for_saving, axis=0)
columns = ["Pred. Temperature", "Pred Log(g)", "Pred Metallicity", 
           "True Temperature", "True Log(g)", "True Metallicity"]
d2.columns = columns
d2.to_csv(pred_path, header=True, index=False, mode="w")
print()
print("End")
Parameters are printed as [Temperature, Log(G), Metallicity]
---------------------------------------------------------------------------------------------
Predicted: [27.747 -1.144 -1.559], Real values: [27.  -1.  -2.5], DIFF: [[0.027 0.126 0.604]]

Predicted: [18.32  -3.857  0.158], Real values: [18.  -4.5 -0. ], DIFF: [[0.017 0.167 1.   ]]

Predicted: [27.356 -1.92  -0.796], Real values: [28.  -2.  -0.5], DIFF: [[0.024 0.042 0.372]]

Predicted: [28.757 -3.032 -2.436], Real values: [29.  -3.5 -2.5], DIFF: [[0.008 0.154 0.026]]

Predicted: [18.259 -4.487  1.559], Real values: [17.5 -5.  -0. ], DIFF: [[0.042 0.114 1.   ]]

Predicted: [30.021 -5.534 -2.206], Real values: [30.  -5.  -2.5], DIFF: [[0.001 0.096 0.133]]

Predicted: [12.508 -4.306  0.261], Real values: [13.  -4.5 -0. ], DIFF: [[0.039 0.045 1.   ]]

Predicted: [26.947 -5.766 -0.676], Real values: [27.  -6.  -1.5], DIFF: [[0.002 0.041 1.219]]

Predicted: [25.735 -0.719  0.315], Real values: [26.  -0.  -0.5], DIFF: [[0.01  1.    2.589]]

Predicted: [28.82  -3.287  0.393], Real values: [29.  -3.5 -0.5], DIFF: [[0.006 0.065 2.273]]

Predicted: [25.59  -3.625 -3.98 ], Real values: [26. -3. -4.], DIFF: [[0.016 0.172 0.005]]

Predicted: [26.942 -2.943 -1.358], Real values: [27.  -3.5 -2. ], DIFF: [[0.002 0.189 0.473]]

Predicted: [27.8   -2.972 -3.178], Real values: [28.  -3.  -3.5], DIFF: [[0.007 0.009 0.101]]

Predicted: [12.375 -2.342  1.973], Real values: [12.  -2.5 -0. ], DIFF: [[0.03  0.067 1.   ]]

Predicted: [26.559 -1.167 -2.176], Real values: [27. -1. -3.], DIFF: [[0.017 0.143 0.379]]

Predicted: [26.104 -1.15  -0.368], Real values: [26.  -1.   0.3], DIFF: [[0.004 0.13  1.815]]

Predicted: [26.243 -0.65  -0.589], Real values: [27. -1. -2.], DIFF: [[0.029 0.538 2.393]]

Predicted: [25.523 -4.935  1.502], Real values: [26.  -5.  -1.5], DIFF: [[0.019 0.013 1.999]]

Predicted: [28.731 -2.137 -0.54 ], Real values: [29. -2. -0.], DIFF: [[0.009 0.064 1.   ]]

Predicted: [24.302 -5.474  1.543], Real values: [25.  -5.5 -0. ], DIFF: [[0.029 0.005 1.   ]]

Predicted: [25.627 -2.203 -0.88 ], Real values: [26.  -2.   0.3], DIFF: [[0.015 0.092 1.341]]

Predicted: [27.788 -1.04  -3.252], Real values: [27.   0.5 -4. ], DIFF: [[0.028 1.481 0.23 ]]

Predicted: [28.819 -4.305  0.531], Real values: [29.  -4.5  0.3], DIFF: [[0.006 0.045 0.435]]

Predicted: [28.029 -5.08  -2.334], Real values: [28.  -4.5 -3. ], DIFF: [[0.001 0.114 0.285]]

Predicted: [24.105 -3.213  0.289], Real values: [24.  -3.5 -0. ], DIFF: [[0.004 0.089 1.   ]]

Predicted: [25.852 -4.293  0.789], Real values: [26.  -4.   0.5], DIFF: [[0.006 0.068 0.367]]

Predicted: [25.807 -1.469 -0.104], Real values: [27.  -2.  -1.5], DIFF: [[ 0.046  0.361 13.383]]

Predicted: [27.635 -4.504  0.127], Real values: [28. -5. -0.], DIFF: [[0.013 0.11  1.   ]]

Predicted: [26.939 -5.304  0.368], Real values: [27.  -5.5 -1.5], DIFF: [[2.274e-03 3.690e-02 5.079e+00]]

Predicted: [25.339 -5.382  0.878], Real values: [26.  -5.5  0.3], DIFF: [[0.026 0.022 0.658]]

Predicted: [26.105 -2.395 -0.456], Real values: [27.  -2.5 -0.5], DIFF: [[0.034 0.044 0.096]]

Predicted: [26.212 -1.644 -2.485], Real values: [26. -1. -3.], DIFF: [[0.008 0.392 0.207]]

Predicted: [26.374 -1.2    0.146], Real values: [27. -0. -1.], DIFF: [[0.024 1.    7.844]]

Predicted: [25.561 -3.608  1.877], Real values: [26.  -4.  -1.5], DIFF: [[0.017 0.109 1.799]]

Predicted: [25.641 -3.414  0.747], Real values: [26. -4. -0.], DIFF: [[0.014 0.172 1.   ]]

Predicted: [28.586 -2.812  0.827], Real values: [29. -3. -0.], DIFF: [[0.014 0.067 1.   ]]

Predicted: [22.185 -5.777 -1.218], Real values: [23.  -5.5 -0. ], DIFF: [[0.037 0.048 1.   ]]

Predicted: [27.453 -0.785 -2.866], Real values: [28. -1. -3.], DIFF: [[0.02  0.273 0.047]]

Predicted: [28.378 -0.181 -2.483], Real values: [30.  -1.  -2.5], DIFF: [[0.057 4.531 0.007]]

Predicted: [29.579 -4.053  0.106], Real values: [30.  -4.5 -0.5], DIFF: [[0.014 0.11  5.704]]

Predicted: [28.828 -2.693 -4.354], Real values: [29.  -2.5 -4. ], DIFF: [[0.006 0.072 0.081]]

Predicted: [28.837 -0.52  -0.585], Real values: [29. -1. -2.], DIFF: [[0.006 0.922 2.417]]

Predicted: [26.76  -3.334  0.93 ], Real values: [27.  -3.5 -0.5], DIFF: [[0.009 0.05  1.538]]

Predicted: [29.33  -3.985 -2.569], Real values: [29. -4. -3.], DIFF: [[0.011 0.004 0.168]]

Predicted: [26.738 -5.286  1.774], Real values: [27.  -5.5 -0.5], DIFF: [[0.01  0.04  1.282]]

Predicted: [29.676 -4.443  0.264], Real values: [30. -5. -0.], DIFF: [[0.011 0.125 1.   ]]

Predicted: [27.417 -0.553 -0.361], Real values: [28. -1. -2.], DIFF: [[0.021 0.809 4.547]]

Predicted: [22.401 -4.804  0.34 ], Real values: [22.5 -5.  -0. ], DIFF: [[0.004 0.041 1.   ]]

Predicted: [29.22  -0.291 -2.059], Real values: [30.  -1.5 -2. ], DIFF: [[0.027 4.152 0.029]]

Predicted: [29.003 -0.757 -1.604], Real values: [29.  -1.5 -2. ], DIFF: [[9.470e-05 9.811e-01 2.470e-01]]


End

Create Bar Chart

In [81]:
# Load the data first
loaded_predictions_VIS = pd.read_csv(pred_path, sep=',', engine='python')

def autolabel(rects, color='black', extra_height=0):
    """
    Attach a text label above each bar displaying its height
    """
    for rect in rects:
        height = rect.get_height()
        height1 = height + extra_height
        ax.text(rect.get_x() + rect.get_width()/2., 1.005*height1,
                '{0:2.0f}'.format(float(height)),
                ha='center', va='bottom', color=color)
        

def autolabel_2(rects, color='black', extra_height=0):
    """
    Attach a text label above each bar displaying its height
    """
    for rect in rects:
        height = rect.get_height()
        height1 = height + extra_height
        ax.text(rect.get_x() + rect.get_width()/2., 1.005*height1,
                '{0:2.1f}'.format(float(height)),
                ha='center', va='bottom', color=color)
In [87]:
# data to plot
n_groups = 10
pred_temp = loaded_predictions_VIS.loc[[1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "Pred. Temperature"] * 100
true_temp = loaded_predictions_VIS.loc[[1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "True Temperature"] * 100

# create plot
fig, ax = plt.subplots(figsize=(14,7))
index = np.arange(n_groups)
bar_width = 0.25
opacity = 1
 
rects1 = plt.bar(index, pred_temp, bar_width,
                 alpha=opacity,
                 color='black',
                 label='Predicted Effective Temperature')
 
rects2 = plt.bar(index + bar_width, true_temp, bar_width,
                 alpha=opacity,
                 color='red',
                 label='True Temperature')
 
plt.xlabel('Test Run', fontsize=17)
plt.ylabel('Temperature (K)', fontsize=17)
plt.title('Effective Temperature for VIS', fontsize=20)
plt.xticks(index + bar_width, ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), fontsize=15)
plt.yticks(fontsize=15)
plt.legend(loc="lower right", fontsize=15, framealpha=0.9)
 
plt.tight_layout()
plt.show()
In [88]:
# data to plot
n_groups = 10
pred_log = np.power(10, -1*loaded_predictions_VIS.loc[[1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "Pred Log(g)"])
true_log = np.power(10, -1*loaded_predictions_VIS.loc[[1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "True Log(g)"])
 
# create plot
fig, ax = plt.subplots(figsize=(14,7))
index = np.arange(n_groups)
bar_width = 0.25
opacity = 1
 
rects1 = plt.bar(index, pred_log, bar_width,
                 alpha=opacity,
                 color='black',
                 label='Predicted Surface Gravity')
 
rects2 = plt.bar(index + bar_width, true_log, bar_width,
                 alpha=opacity,
                 color='red',
                 label='True Surface Gravity')
 
plt.xlabel('Test Run', fontsize=17)
plt.ylabel('Surface Gravity', fontsize=17)
plt.title('Surface Gravity for VIS', fontsize=20)
plt.xticks(index + bar_width, ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), fontsize=15)
plt.yticks(fontsize=15)
plt.legend(loc="upper right", fontsize=15, framealpha=0.9)
 
autolabel(rects1)
c, extra_height = 0, 3500
for rect in rects2:
    if c == 2 or c == 7 or c == 9:
        height = rect.get_height()
        height1 = height + extra_height
        ax.text(rect.get_x() + rect.get_width()/2., 1.005*height1,
                '{0:2.0f}'.format(float(height)),
                ha='center', va='bottom', color='red')
    c += 1

# Dumb solution, but it works!
c = 0
for rect in rects2:
    if c == 1 or c == 3 or c == 4 or c == 5 or c == 6 or c == 8 or c == 0:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., 1.005*height,
                '{0:2.0f}'.format(float(height)),
                ha='center', va='bottom', color='red')
    c += 1

plt.tight_layout()
plt.show()
In [89]:
# data to plot
n_groups = 10
pred_met = loaded_predictions_VIS.loc[[1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "Pred Metallicity"]
true_met = loaded_predictions_VIS.loc[[1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "True Metallicity"]
 
# create plot
fig, ax = plt.subplots(figsize=(14,7))
index = np.arange(n_groups)
bar_width = 0.25
opacity = 1
 
rects1 = plt.bar(index, pred_met, bar_width,
                 alpha=opacity,
                 color='black',
                 label='Predicted Metallicity')
 
rects2 = plt.bar(index + bar_width, true_met, bar_width,
                 alpha=opacity,
                 color='red',
                 label='True Metallicity')
 
plt.xlabel('Test Run', fontsize=17)
plt.ylabel('Metallicity', fontsize=17)
plt.title('Metallicity for VIS', fontsize=20)
plt.xticks(index + bar_width, ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), fontsize=15)
plt.yticks(fontsize=15)
plt.legend(loc="lower left", fontsize=15, framealpha=0.9)
 
autolabel_2(rects1, extra_height=-.15)
autolabel_2(rects2, 'red', extra_height=-.15)  

plt.tight_layout()
plt.show()

Evaluate the VIS Model

In [25]:
loss_and_metrics = VIS_model.evaluate_generator(my_test_batch_generator_VIS, 
                                                steps=100, 
                                                max_queue_size=1, 
                                                workers=4, 
                                                use_multiprocessing=True, 
                                                verbose=1)
/home/salomonsson/anaconda3/envs/MP2/lib/python3.6/site-packages/keras/engine/training_generator.py:290: UserWarning: Using a generator with `use_multiprocessing=True` and multiple workers may duplicate your data. Please consider using the`keras.utils.Sequence class.
  UserWarning('Using a generator with `use_multiprocessing=True`'
100/100 [==============================] - 133s 1s/step
In [26]:
print(VIS_model.metrics_names)
print(loss_and_metrics)
['loss', 'mean_squared_error']
[0.0007613567385124043, 0.0007613567385124043]

Calculate the Error over the Testing set for each one of the parameters for VIS

In [27]:
# Load the data first
loaded_predictionsVIS = pd.read_csv(pred_path, sep=',', engine='python')
In [28]:
# Remove 0-values as
loaded_predictionsVIS = loaded_predictionsVIS[loaded_predictionsVIS.loc[:, "True Log(g)"] != 0.0]
loaded_predictionsVIS = loaded_predictionsVIS[loaded_predictionsVIS.loc[:, "True Metallicity"] != 0.0]
In [29]:
# Calculate differences between the predicted and true values
loaded_predictionsVIS.loc[:, "Diff Temp (%)"] = (loaded_predictionsVIS.loc[:, "Pred. Temperature"]\
                    -loaded_predictionsVIS.loc[:, "True Temperature"])/loaded_predictionsVIS.loc[:, "Pred. Temperature"]

loaded_predictionsVIS.loc[:, "Diff Log(g) (%)"] = (loaded_predictionsVIS.loc[:, "Pred Log(g)"]\
                    -loaded_predictionsVIS.loc[:, "True Log(g)"])/loaded_predictionsVIS.loc[:, "Pred Log(g)"]

loaded_predictionsVIS.loc[:, "Diff Met (%)"] = (loaded_predictionsVIS.loc[:, "Pred Metallicity"]\
                    -loaded_predictionsVIS.loc[:, "True Metallicity"])/loaded_predictionsVIS.loc[:, "Pred Metallicity"]
In [30]:
diff_log = copy.deepcopy(loaded_predictionsVIS)
# Keep only the data within +2 to -2 of standard deviations.

diff_log = diff_log[(np.abs(diff_log["Diff Temp (%)"]-diff_log["Diff Temp (%)"].mean())\
                      <= (2*diff_log["Diff Temp (%)"].std()))]

diff_log = diff_log[(np.abs(diff_log["Diff Log(g) (%)"]-diff_log["Diff Log(g) (%)"].mean())\
                      <= (2*np.abs(diff_log["Diff Log(g) (%)"]).std()))]

diff_log = diff_log[(np.abs(diff_log["Diff Met (%)"]-diff_log["Diff Met (%)"].mean())\
                      <= (2*diff_log["Diff Met (%)"].std()))]

diff_log = diff_log[np.abs(diff_log.loc[:, "Diff Log(g) (%)"]) <= 5.0] # manually remove three values that are x7 bigger

display(diff_log.head(20))
Pred. Temperature Pred Log(g) Pred Metallicity True Temperature True Log(g) True Metallicity Diff Temp (%) Diff Log(g) (%) Diff Met (%)
2 27.356495 -1.919629 -0.795781 28.0 -2.0 -0.5 -0.023523 -0.041868 0.371686
3 28.756765 -3.032237 -2.436199 29.0 -3.5 -2.5 -0.008458 -0.154263 -0.026189
5 30.021088 -5.533601 -2.205607 30.0 -5.0 -2.5 0.000702 0.096429 -0.133475
7 26.946716 -5.765832 -0.676001 27.0 -6.0 -1.5 -0.001977 -0.040613 -1.218932
9 28.819900 -3.287172 0.392705 29.0 -3.5 -0.5 -0.006249 -0.064745 2.273220
10 25.590008 -3.624514 -3.980172 26.0 -3.0 -4.0 -0.016022 0.172303 -0.004982
11 26.942148 -2.943071 -1.357924 27.0 -3.5 -2.0 -0.002147 -0.189234 -0.472837
12 27.800482 -2.972109 -3.178014 28.0 -3.0 -3.5 -0.007177 -0.009384 -0.101317
14 26.559341 -1.167474 -2.176006 27.0 -1.0 -3.0 -0.016591 0.143450 -0.378672
15 26.104200 -1.149576 -0.368056 26.0 -1.0 0.3 0.003992 0.130114 1.815093
16 26.242569 -0.650135 -0.589465 27.0 -1.0 -2.0 -0.028863 -0.538142 -2.392907
17 25.523283 -4.935251 1.502111 26.0 -5.0 -1.5 -0.018678 -0.013120 1.998594
20 25.627298 -2.203398 -0.880310 26.0 -2.0 0.3 -0.014543 0.092311 1.340789
22 28.819471 -4.305013 0.531164 29.0 -4.5 0.3 -0.006264 -0.045293 0.435203
23 28.028702 -5.079615 -2.334192 28.0 -4.5 -3.0 0.001024 0.114106 -0.285241
25 25.851816 -4.293086 0.789351 26.0 -4.0 0.5 -0.005732 0.068269 0.366568
29 25.339212 -5.382259 0.877733 26.0 -5.5 0.3 -0.026078 -0.021876 0.658211
30 26.105270 -2.395174 -0.456296 27.0 -2.5 -0.5 -0.034274 -0.043765 -0.095781
31 26.211718 -1.644097 -2.485354 26.0 -1.0 -3.0 0.008077 0.391763 -0.207072
33 25.560596 -3.607813 1.876520 26.0 -4.0 -1.5 -0.017191 -0.108705 1.799352
In [33]:
print("VIS")
print("=================")
print()
# Print the Mean error
print("MSE for Temperature:  {0:2.6f}".format(np.power(diff_log["Diff Temp (%)"].mean(), 2)))
print("MSE for Gravity:      {0:2.6f}".format(np.power(diff_log["Diff Log(g) (%)"].mean(), 2)))
print("MSE for Metallicity:  {0:2.6f}".format(np.power(diff_log["Diff Met (%)"].mean(), 2)))
print()

# Print the median error
print("Median error for Temperature:  {0:2.6f}".format(np.abs(diff_log["Diff Temp (%)"].median())))
print("Median error for Gravity:      {0:2.6f}".format(np.abs(diff_log["Diff Log(g) (%)"].median())))
print("Median error for Metallicity:  {0:2.6f}".format(np.abs(diff_log["Diff Met (%)"].median())))
VIS
=================

MSE for Temperature:  0.000092
MSE for Gravity:      0.007013
MSE for Metallicity:  0.045567

Median error for Temperature:  0.007177
Median error for Gravity:      0.040477
Median error for Metallicity:  0.026189
In [ ]:
# Clear the session as a preperation for the next one. 
backend.clear_session()

Estimate CARMENES' parameters

Load and pre-process the normalised files

In [73]:
def get_CARMENES_norm_Info(NIR_or_VIS):
    """ Returns the file paths and names for the pre-processed Carmenes data.
        NIR_or_VIS is a string indicating the desired output ("NIR" or "VIS").
    """        
    # Get the directory for each .dat file and store them in file_paths
    # Choose directory depending on the OS I'm running 
    if "Darwin" in os.uname():
        directory = "/Users/jakob/Desktop/Master Project/git/carmenes_normalised/"
    else:
        directory = "/home/salomonsson/Master Project/git/carmenes_normalised/"
    
    carmenes_norm_paths = glob(directory + "*.dat")   
    carmenes_norm_paths_NIR, carmenes_norm_paths_VIS = [], []
    for path in carmenes_norm_paths:
        if "NIR" in path.upper():
            carmenes_norm_paths_NIR.append(path)
        elif "VIS" in path.upper():
            carmenes_norm_paths_VIS.append(path)
        else:
            pass
    # Sort the file paths
    carmenes_norm_paths_NIR = sorted(carmenes_norm_paths_NIR, key=sorted_nicely_short)
    carmenes_norm_paths_VIS = sorted(carmenes_norm_paths_VIS, key=sorted_nicely_short)
    
    # Get the file names
    carmenes_norm_NAMES_NIR, carmenes_norm_NAMES_VIS = [], []
    for path in carmenes_norm_paths_NIR:
        carmenes_norm_NAMES_NIR.append(str(path)[57:-4])
    for path in carmenes_norm_paths_VIS:
        carmenes_norm_NAMES_VIS.append(str(path)[57:-4])
    
    # Return NIR or VIS paths and names
    if NIR_or_VIS.upper() == "NIR":
        return carmenes_norm_paths_NIR, carmenes_norm_NAMES_NIR
    elif NIR_or_VIS.upper() == "VIS":
        return carmenes_norm_paths_VIS, carmenes_norm_NAMES_VIS
    else:
        print('The input is not correct. Enter "VIS" or "NIR" as strings. ')

carmenes_norm_paths_NIR, carmenes_norm_NAMES_NIR = get_CARMENES_norm_Info("NIR")
carmenes_norm_paths_VIS, carmenes_norm_NAMES_VIS = get_CARMENES_norm_Info("VIS")


def create_CARMENES_MATRIX_path(name, NIR_or_VIS):
    """
        Create file path to store files. 'name' and 'NIR_or_VIS' are strings.
    """
    if NIR_or_VIS.upper() == "NIR":
        tp = "NIR/"
    elif NIR_or_VIS.upper() == "VIS":
        tp = "VIS/"
    else:
        print('Please enter "NIS" or "VIR" as second input parameter.')
        
    if "Darwin" in os.uname():
        base = "/Users/jakob/Desktop/Master Project/POWER_matrices_CARMENES/" + tp
    else:
        base = "/home/salomonsson/Master Project/POWER_matrices_CARMENES/" + tp
    return(base + name + ".csv")


def create_CARMENES_path(name, NIR_or_VIS):
    """ Create file path to store files"""
    if NIR_or_VIS.upper() == "NIR":
        tp = "NIR/"
    elif NIR_or_VIS.upper() == "VIS":
        tp = "VIS/"
    else:
        print('Please enter "NIS" or "VIR" as second input parameter.')
    
    if "Jakobs-MBP.home" in os.uname():
        base = "/Users/jakob/Desktop/Master Project/git/CARMENES_for_Matrices/" +tp
    else:
        base = "/home/salomonsson/Master Project/git/CARMENES_for_Matrices/" + tp
    return(base + name + ".dat")
In [74]:
print(len(carmenes_norm_paths_VIS))
print(len(carmenes_norm_paths_NIR))
549
252
In [75]:
# Make some local copies
carmenes_data_VIS = copy.deepcopy(all_loaded_data_VIS_average)
carmenes_data_NIR = copy.deepcopy(all_loaded_data_NIR_average) 
In [76]:
def get_prepared_Carmenes_for_Matrix(carmenes_data, x, VIS_or_NIR):
    """ 
        Returns the Carmenes data stored in position "number".
        NIR_or_VIS is a string, "NIR" or "VIS", indicating the desired output
        "number" is a number between 0 and 8 indicating which one of the 9 Carmenes files to return.
    """ 
    if VIS_or_NIR.upper() == "NIR":
        wave = "NIR"
    elif VIS_or_NIR.upper() == "VIS":
        wave = "VIS"
    else:
        print("Something went wrong, add VIS_or_NIR")
     
    nv, R, order = 50, 82000, 0
    name = str(carmenes_data[x][0].index.names)[-45:-10]
    
    for m in range(len(carmenes_data[x])): 
        small_list = []
        for n in range(len(carmenes_data[x][m])):
            """Perform the necessary calculations"""
                
            # Get the Angstrom value at position "n" in the Carmenes data and store it
            use_this = float(copy.deepcopy(carmenes_data[x][m].iloc[n,0]))           
            ar = carmenes_data[x][m].iloc[:,0]
                
            # Get the corresponding index
            itemindex = np.where(ar==use_this)[0]
                           
            # Convert to integers and floats
            ind = int(itemindex[int(0)])
            use_this = float(use_this)
                
            # Convolution intervals
            xsg1 = ind - nv
            xsg2 = ind + nv
            
            if xsg1 < 0:
                xsg1 = 0
            if xsg2 > len(carmenes_data[x][m]):
                xsg2 = len(carmenes_data[x][m])
            
            # Apply these intervals to the BT-Settl data
            xsgA = carmenes_data[x][m]["Angstrom"].iloc[xsg1:(xsg2)]
            xsgF = carmenes_data[x][m]["Flux"].iloc[(xsg1):xsg2]
                
            # Calculate the impact from the chosen BT-Settl flux values
            xt = copy.deepcopy(use_this)
            sgm = float(use_this/(R*2*np.sqrt(2.*np.log(2.))))
            flt = np.exp(-(xsgA - xt)**2/(2*sgm**2))/(np.sqrt(2*np.pi)*sgm)             
            
            # Make the arrays compatible for multiplication
            if len(np.diff(xsgA2)) != len(flt):
                flt = flt[:-1]
            if len(np.diff(xsgA2)) != len(xsgF):
                xsgF = xsgF[:-1]
            
            # Calculate the sum of this impact, reverse xsgF before multiplying
            xsgA2 = carmenes_data[x][m]["Angstrom"].iloc[(xsg1):(xsg2)]
            the_sum = np.sum(np.diff(xsgA2)*flt*xsgF[::-1])
                
            # Add the newly calculated flux to the Carmenes data on the appropriate position
            temp = copy.deepcopy(carmenes_data[x][m].iloc[n,:])
            temp["Flux"] = the_sum
            small_list.append(temp)
            
        # Add some space
        temp_list = []
        temp_list.append(' ')
        temp_list.append(' ')
        temp_list.append("#order: {0}".format(order))
        pd.DataFrame(temp_list).to_csv(create_CARMENES_path(name, wave), 
                                        header=False, index=False, mode="a")
        order += 1
            
        # Create the dataframe
        df = pd.DataFrame(small_list)
            
        # Convert the df to numerical values
        df = df.apply(pd.to_numeric, errors='ignore')  
            
        # Calculate the Rolling mean for the Flux and equal the "area below" to 1.
        temp_df = (df["Flux"].rolling(2).mean()[1:]*(np.diff(df["Angstrom"])))
        df["Flux"] = df["Flux"]/temp_df.sum()
            
        # Write to file
        df.to_csv(create_CARMENES_path(name, wave), header=False, index=False, mode="a")
In [291]:
# Create CARMENES VIS files
#for i in range(len(carmenes_data_VIS)):
    get_prepared_Carmenes_for_Matrix(carmenes_data_VIS, i, "VIS")
In [294]:
# Create CARMENES NIR files
#for i in range(len(carmenes_data_NIR)):
    get_prepared_Carmenes_for_Matrix(carmenes_data_NIR, i, "NIR")
In [86]:
def get_CARMENES_READY_Info(NIR_or_VIS):
    """ Returns the file paths and names for the pre-processed CARMENES data. 
        'NIR_or_VIS' is a string, either 'NIR' or 'VIS'. """
    if NIR_or_VIS.upper() == "NIR":
        tp = "NIR/"
    elif NIR_or_VIS.upper() == "VIS":
        tp = "VIS/"
    else:
        print('The input is not correct. Enter "VIS" or "NIR" as strings. ')
    
    # Get the directory for each .dat file and store them in file_paths
    # Choose directory depending on the OS I'm running 
    if "Darwin" in os.uname():
        directory = "/Users/jakob/Desktop/Master Project/git/CARMENES_for_Matrices/" + tp
    else:
        directory = "/home/salomonsson/Master Project/git/CARMENES_for_Matrices/" + tp
    
    carmenes_READY_paths = glob(directory + "*.dat")            

    # Get the .dat file names
    carmenes_READY_names = []
    for root, dirs, files in os.walk(directory):                                 
        for filename in files:
            if filename.endswith(".dat"):
                filename = filename[:-4]                     # Remove unneccessary "common" information from the name
                carmenes_READY_names.append(filename)
    return carmenes_READY_names, carmenes_READY_paths
    

# This function needed to be rewritten and changed even though it does the same thing on very similar input data as the 
# "load_all_data" function used in the very beginning of this document. I havent found a reason for why finding all 
# the "#" didnt worked as before. 
def load_CARMENES_READY_data(name):
    """ Returns the btsettl dataframe with the name "name". """
    if "NIR" in name.upper():
        paths = carmenes_READY_paths_NIR
    elif "VIS" in name.upper():
        paths = carmenes_READY_paths_VIS
    else:
         print('Input is incorrect. Enter a file name.')  
            
    for path in paths:
        if name in path:
            data = pd.read_table(path, sep=",", header=None, skiprows=3)
            data.index.names = [name]                               # Add the file name to the dataframe  
            #limits = (data[data.values == '#'])                    # Find all the indeces with a "#" (Not working!)
            limits = {'# order: 0': 0}
            for i in range(len(data)):
                if "#" in data.iloc[i,0]:
                    limits[data.iloc[i,0]] = data.index[i]
                    
            keys = [key for key, value in limits.items()]
            values = [int(value) for key, value in limits.items()]            
            data_pieces, count = [], 0
            for i in range(len(limits)):
                start = values[count]
                if count == (len(limits)-1):
                    data_piece = data.iloc[start:,:]        # The last data piece contains the last order in the df data
                else:
                    data_piece = data.iloc[start:values[count+1],:]  # Select the correct data piece
                data_piece.index.names = [str(data_piece.index.names)[2:-2] + "_Order_" + str(count)] # add order to name
                data_piece.columns = ["Angstrom", "Flux"]
                data_piece = data_piece.dropna(inplace=False)                      # Remove NaN-values
                #data_piece.dropna(inplace=True)     # Not like this to avoid a "SettingWithCopyWarning" error
                data_pieces.append(data_piece)
                count += 1
    return data_pieces


#test_data = load_CARMENES_READY_data(carmenes_READY_names_NIR[0])

Create Power Matrices

NIR

In [87]:
# A minor change compared to previous "get_POWER_matrix" where 
# the matrices have to be Resized to match for concatenation
def get_FULL_POWER_matrix_Carmenes(data):
    """ Returns the power matrix, the scale indices and the Fourier frequencies of "data". 
        All orders included.
    """
    power_matrix, full_scales, full_freqs = np.array([1]), np.array([1]), np.array([1])
    for i in range(len(data)):
        power, scales, freqs = get_POWER_matrix(data[i].loc[:, "Flux"])
        if len(power_matrix) > 10:                                # if no power matrix is created
            
            # Resize "power" to match "power_matrix"'s dimensions.
            power = skimage.transform.resize(power, (power.shape[0], power_matrix.shape[1]), 
                                           mode='constant', preserve_range=True)
            
            power_matrix = np.append(power_matrix, power, axis=0)
            full_scales = np.append(full_scales, scales, axis=0)
            full_freqs = np.append(full_freqs, freqs, axis=0)
        else:        
            power_matrix = np.array(power)
            full_scales = np.array(scales)
            full_freqs = np.array(freqs)
    return power_matrix, full_scales, full_freqs
In [88]:
# Make sure the equal amount of orders exists in all NIR and VIS files respectively.
NIR_OK, VIS_OK = "NO", "NO"
if check_orders(all_loaded_data_NIR) == True:
    NIR_OK = "YES"
if check_orders(all_loaded_data_VIS) == True:
    VIS_OK = "YES"
Number of orders is equal
Number of orders is equal
In [89]:
### Each NIR file takes some 20 seconds to process. 335 min in total ###

carmenes_READY_names_NIR, carmenes_READY_paths_NIR = get_CARMENES_READY_Info("NIR")

if "Darwin" in os.uname():
    d = "/Users/jakob/Desktop/Master Project/POWER_matrices_CARMENES/NIR/"
else:
    d = "/home/salomonsson/Master Project/POWER_matrices_CARMENES/NIR/"
for dirpath, dirnames, files in os.walk(d):   # Get the amount of NIR files in the directory d
    nir_files = []
    for f in files:
        if "NIR" in f.upper():
            nir_files.append(f) 
            
if len(nir_files) < len(carmenes_READY_names_NIR):   # if all the VIS files havent been "converted" to power matrices 
    start1 = time.time()
    if NIR_OK == "YES":
        for i in range(len(carmenes_READY_names_NIR)):      #len(btsettl_READY_names_NIR)
            NIR_name = carmenes_READY_names_NIR[i]
            NIR_d = load_CARMENES_READY_data(NIR_name)
            NIR_m = get_FULL_POWER_matrix_Carmenes(NIR_d)[0]     # Get the Flux's power matrix
            NIR_path = create_CARMENES_MATRIX_path(NIR_name, 'NIR')
            pd.DataFrame(NIR_m, dtype='h').to_csv(NIR_path, header=True, index=False, mode="w")  # dtype = short
    
    # Check how long time it took.
    end1 = time.time()
    tt1 = end1 - start1
    print("Total run time: {0:0.0f} min and {1:3.0f} s ".format((tt1-tt1%60)/60, tt1%60))
else:
    print("All the power matrices for the CARMENES NIR files are already in the directory")  
All the power matrices for the CARMENES NIR files are already in the directory

VIS

In [90]:
### Each VIS file takes some 30 seconds to process. ###

carmenes_READY_names_VIS, carmenes_READY_paths_VIS = get_CARMENES_READY_Info("VIS")

if "Darwin" in os.uname():
    d = "/Users/jakob/Desktop/Master Project/POWER_matrices_CARMENES/VIS/"
else:
    d = "/home/salomonsson/Master Project/POWER_matrices_CARMENES/VIS/"  
for dirpath, dirnames, files in os.walk(d):        # Get the amount of VIS files in the directory d
    vis_files = []
    for f in files:
        if "VIS" in f.upper():
            vis_files.append(f) 

if len(vis_files) < len(carmenes_READY_names_VIS):   # if all the VIS files havent been "converted" to power matrices 
    start1 = time.time()
    if VIS_OK == "YES":
        for i in range(len(carmenes_READY_names_VIS)):
            VIS_name = carmenes_READY_names_VIS[i]
            VIS_d = load_CARMENES_READY_data(VIS_name)
            VIS_m = get_FULL_POWER_matrix_Carmenes(VIS_d)[0]
            VIS_path = create_CARMENES_MATRIX_path(VIS_name, 'VIS')
            pd.DataFrame(VIS_m, dtype='h').to_csv(VIS_path, header=True, index=False, mode="w")  # dtype = short

    # Check how long time it took.
    end1 = time.time()
    tt1 = end1 - start1
    print("Total run time: {0:0.0f} min and {1:3.0f} s ".format((tt1-tt1%60)/60, tt1%60))
else:
    print("All the power matrices for the CARMENES VIS files are already in the directory")
All the power matrices for the CARMENES VIS files are already in the directory
In [35]:
def get_MATRIX_Carmenes_paths(NIR_or_VIS):
    """ Return all the matrix paths for 'NIR' or 'VIS'. """
    if NIR_or_VIS.upper() == "NIR":
        tp = "NIR/"
    elif NIR_or_VIS.upper() == "VIS":
        tp = "VIS/"
    else:
        print('Please enter "NIS" or "VIR".')
        
    if "Darwin" in os.uname():
        directory = "/Users/jakob/Desktop/Master Project/POWER_matrices_CARMENES/" + tp
    else:
        directory = "/home/salomonsson/Master Project/POWER_matrices_CARMENES/" + tp
    return glob(directory + "*.csv")


def get_Carmenes_Matrix(path):
    """ Returns the matrix and its name stored in 'path'.
    """
    matrix = pd.read_csv(path)     # load the power matrix        
    filename = os.path.basename(path)[:-4]
    return matrix.values, filename
In [32]:
# Create MinMax scaler to sunscale the data
param_scaler = MinMaxScaler(feature_range=(0,1), copy=True)
mat, param, name = get_Matrix_and_Parameters(get_MATRIX_paths("NIR")[0])
print(param)
scaled_param = param_scaler.fit_transform(param.reshape(-1,1))
print(scaled_param)
[29.  -1.5 -3. ]
[[1.      ]
 [0.046875]
 [0.      ]]

Make predictions

NIR

In [45]:
# Load the best performing NIR model

if "Darwin" in os.uname():
    model_path = "/Users/jakob/Desktop/Master Project/Models/NIR_100epochs_Matrix_DivOn2_Sequence.hdf5"
    pred_path = "/Users/jakob/Desktop/Master Project/Predictions/CARMENES/NIR_predictions_CARMENES.xlsx"
else:
    start = "/home/salomonsson/Master Project/Best_models/NIR/"
    model_path = start+"NIR_100epochs_Matrix_DivOn2_Sequence_MAIN_FINAL_MODEL/NIR_100epochs_Matrix_DivOn2_Sequence.hdf5"
    pred_path = "/home/salomonsson/Master Project/Predictions/CARMENES/NIR_predictions_CARMENES.xlsx"
    
NIR_model.load_weights(model_path)
In [46]:
np.set_printoptions(precision=3)
list_for_saving = []
down_size = 2
carMat_paths = get_MATRIX_Carmenes_paths("NIR")


print("Parameters are printed as [Temperature, Log(G), Metallicity]")
print("---------------------------------------------------------------------------------------------")
print(" ")
for path in range(9):
    # Load the matrix, parameters and the name. Resize the matrix
    mat, name = get_Carmenes_Matrix(carMat_paths[path])  
    mat = skimage.transform.resize(mat, (mat.shape[0]//down_size, mat.shape[1]//down_size, 3), 
                                           mode='constant', preserve_range=True)
    
    # Reshape the matrix to "[batch, 3D RGB image]"
    mat = np.reshape(mat, (1, mat.shape[0], mat.shape[1], mat.shape[2]))
    
    # Rescale the matrix to [0,1]
    mat = normalise(mat)                                                  
    
    # Use the model to predict the parameters.
    predicted_parameters = NIR_model.predict(mat)

    # Use the fitted scaler to unscale the parameters to their original interval
    unscaled_pred_param = param_scaler.inverse_transform(predicted_parameters)
    
    # Prepare for saving to file
    d = pd.DataFrame(unscaled_pred_param.reshape(1,-1))
    dfName = pd.DataFrame([name])
    d1 = pd.concat([dfName, d], axis=1)
    #list_for_saving.append(pd.DataFrame([name]))
    list_for_saving.append(d1)
    
    # Print the results (transpose the unscaled predicted parameters for readability).
    print("Matrix:               {0}".format(name))
    print("Predicted Parameters: {0}".format(unscaled_pred_param[0]))
    print()


# Save to file
d2 = pd.concat(list_for_saving, axis=0)
columns = ["Matrix name", "Predicted Temperature", "Predicted Log(g)", "Predicted Metallicity"]
d2.columns = columns
d2.to_csv(pred_path, header=True, index=False, mode="w")

print()
print("End")
Parameters are printed as [Temperature, Log(G), Metallicity]
---------------------------------------------------------------------------------------------
 
Matrix:               car-20170520T20h38m14s-sci-gtoc-nir
Predicted Parameters: [22.604 -1.866 -1.051]

Matrix:               car-20170825T00h06m21s-sci-gtoc-nir
Predicted Parameters: [24.183 -1.712 -1.891]

Matrix:               car-20170609T20h33m03s-sci-gtoc-nir
Predicted Parameters: [24.106 -1.674 -1.813]

Matrix:               car-20170912T02h41m57s-sci-gtoc-nir
Predicted Parameters: [23.55  -1.765 -1.283]

Matrix:               car-20170914T03h24m58s-sci-gtoc-nir
Predicted Parameters: [23.672 -1.678 -1.466]

Matrix:               car-20170822T01h54m18s-sci-gtoc-nir
Predicted Parameters: [23.559 -1.757 -1.293]

Matrix:               car-20170913T21h52m13s-sci-gtoc-nir
Predicted Parameters: [24.115 -1.646 -1.336]

Matrix:               car-20170924T20h42m07s-sci-gtoc-nir
Predicted Parameters: [24.013 -1.686 -1.212]

Matrix:               car-20170911T01h42m21s-sci-gtoc-nir
Predicted Parameters: [23.547 -1.765 -1.287]


End

VIS

In [52]:
# Load best performing NIR model

if "Darwin" in os.uname():
    model_path = "/Users/jakob/Desktop/Master Project/Models/VIS_150epochs_Matrix_DivOn3_batch2_Sequence.hdf5"
    pred_path = "/Users/jakob/Desktop/Master Project/Predictions/CARMENES/VIS_predictions_CARMENES.xlsx"
else:
    start = "/home/salomonsson/Master Project/Best_models/VIS/"
    model_path = start+"VIS_150epochs_Matrix_DivOn3_batch2_Sequence/VIS_150epochs_Matrix_DivOn3_batch2_Sequence.hdf5"
    pred_path = "/home/salomonsson/Master Project/Predictions/CARMENES/VIS_predictions_CARMENES.xlsx"
    
VIS_model.load_weights(model_path)
In [53]:
mat, name = get_Carmenes_Matrix(carMat_paths[0]) 
In [54]:
mat.shape
Out[54]:
(8145, 2871)
In [55]:
7747//down_size
Out[55]:
2582
In [56]:
np.set_printoptions(precision=3)
list_for_saving = []
down_size = 3
carMat_paths = get_MATRIX_Carmenes_paths("VIS")


print("Parameters are printed as [Temperature, Log(G), Metallicity]")
print("---------------------------------------------------------------------------------------------")
print(" ")
for path in range(9):
    # Load the matrix, parameters and the name. Resize the matrix to fit into the trained model
    mat, name = get_Carmenes_Matrix(carMat_paths[path])  
    mat = skimage.transform.resize(mat, (7747//down_size, mat.shape[1]//down_size, 3), 
                                           mode='constant', preserve_range=True)
    
    # Reshape the matrix to "[batch, 3D RGB image]"
    mat = np.reshape(mat, (1, mat.shape[0], mat.shape[1], mat.shape[2]))
    
    # Rescale the matrix to [0,1]
    mat = normalise(mat)                                                  
    
    # Use the model to predict the parameters.
    predicted_parameters = VIS_model.predict(mat)

    # Use the fitted scaler to unscale the parameters to their original interval
    unscaled_pred_param = param_scaler.inverse_transform(predicted_parameters)
    
    # Prepare for saving to file
    d = pd.DataFrame(unscaled_pred_param.reshape(1,-1))
    dfName = pd.DataFrame([name])
    d1 = pd.concat([dfName, d], axis=1)
    list_for_saving.append(d1)
    
    # Print the results (transpose the unscaled predicted parameters for readability).
    print("Matrix:               {0}".format(name))
    print("Predicted Parameters: {0}".format(unscaled_pred_param[0]))
    print()


# Save to file
d2 = pd.concat(list_for_saving, axis=0)
columns = ["Matrix name", "Predicted Temperature", "Predicted Log(g)", "Predicted Metallicity"]
d2.columns = columns
d2.to_csv(pred_path, header=True, index=False, mode="w")

print()
print("End")
Parameters are printed as [Temperature, Log(G), Metallicity]
---------------------------------------------------------------------------------------------
 
Matrix:               car-20170924T20h42m07s-sci-gtoc-vis
Predicted Parameters: [28.125 -2.097  0.359]

Matrix:               car-20170914T03h24m58s-sci-gtoc-vis
Predicted Parameters: [27.828 -1.771  1.118]

Matrix:               car-20170825T00h06m21s-sci-gtoc-vis
Predicted Parameters: [27.961 -2.165  0.356]

Matrix:               car-20170913T21h52m13s-sci-gtoc-vis
Predicted Parameters: [27.421 -2.103  2.603]

Matrix:               car-20170520T20h38m14s-sci-gtoc-vis
Predicted Parameters: [27.908 -1.994  0.131]

Matrix:               car-20170912T02h41m57s-sci-gtoc-vis
Predicted Parameters: [28.133 -2.219  0.437]

Matrix:               car-20170822T01h54m18s-sci-gtoc-vis
Predicted Parameters: [28.448 -1.895 -0.316]

Matrix:               car-20170609T20h33m03s-sci-gtoc-vis
Predicted Parameters: [27.823 -1.775  0.347]

Matrix:               car-20170911T01h42m21s-sci-gtoc-vis
Predicted Parameters: [28.537 -1.861 -0.65 ]


End